#1:Reading the data set

diabetic_largedata<-read.csv("diabetic_data.csv",header = T, sep=",",na.strings = "?",stringsAsFactors=T)
head(diabetic_largedata)
##   encounter_id patient_nbr            race gender     age weight
## 1      2278392     8222157       Caucasian Female  [0-10)   <NA>
## 2       149190    55629189       Caucasian Female [10-20)   <NA>
## 3        64410    86047875 AfricanAmerican Female [20-30)   <NA>
## 4       500364    82442376       Caucasian   Male [30-40)   <NA>
## 5        16680    42519267       Caucasian   Male [40-50)   <NA>
## 6        35754    82637451       Caucasian   Male [50-60)   <NA>
##   admission_type_id discharge_disposition_id admission_source_id
## 1                 6                       25                   1
## 2                 1                        1                   7
## 3                 1                        1                   7
## 4                 1                        1                   7
## 5                 1                        1                   7
## 6                 2                        1                   2
##   time_in_hospital payer_code        medical_specialty num_lab_procedures
## 1                1       <NA> Pediatrics-Endocrinology                 41
## 2                3       <NA>                     <NA>                 59
## 3                2       <NA>                     <NA>                 11
## 4                2       <NA>                     <NA>                 44
## 5                1       <NA>                     <NA>                 51
## 6                3       <NA>                     <NA>                 31
##   num_procedures num_medications number_outpatient number_emergency
## 1              0               1                 0                0
## 2              0              18                 0                0
## 3              5              13                 2                0
## 4              1              16                 0                0
## 5              0               8                 0                0
## 6              6              16                 0                0
##   number_inpatient diag_1 diag_2 diag_3 number_diagnoses max_glu_serum
## 1                0 250.83   <NA>   <NA>                1          None
## 2                0    276 250.01    255                9          None
## 3                1    648    250    V27                6          None
## 4                0      8 250.43    403                7          None
## 5                0    197    157    250                5          None
## 6                0    414    411    250                9          None
##   A1Cresult metformin repaglinide nateglinide chlorpropamide glimepiride
## 1      None        No          No          No             No          No
## 2      None        No          No          No             No          No
## 3      None        No          No          No             No          No
## 4      None        No          No          No             No          No
## 5      None        No          No          No             No          No
## 6      None        No          No          No             No          No
##   acetohexamide glipizide glyburide tolbutamide pioglitazone rosiglitazone
## 1            No        No        No          No           No            No
## 2            No        No        No          No           No            No
## 3            No    Steady        No          No           No            No
## 4            No        No        No          No           No            No
## 5            No    Steady        No          No           No            No
## 6            No        No        No          No           No            No
##   acarbose miglitol troglitazone tolazamide examide citoglipton insulin
## 1       No       No           No         No      No          No      No
## 2       No       No           No         No      No          No      Up
## 3       No       No           No         No      No          No      No
## 4       No       No           No         No      No          No      Up
## 5       No       No           No         No      No          No  Steady
## 6       No       No           No         No      No          No  Steady
##   glyburide.metformin glipizide.metformin glimepiride.pioglitazone
## 1                  No                  No                       No
## 2                  No                  No                       No
## 3                  No                  No                       No
## 4                  No                  No                       No
## 5                  No                  No                       No
## 6                  No                  No                       No
##   metformin.rosiglitazone metformin.pioglitazone change diabetesMed readmitted
## 1                      No                     No     No          No         NO
## 2                      No                     No     Ch         Yes        >30
## 3                      No                     No     No         Yes         NO
## 4                      No                     No     Ch         Yes         NO
## 5                      No                     No     Ch         Yes         NO
## 6                      No                     No     No         Yes        >30

**EDA

plot(sapply(diabetic_largedata,function(x)sum(is.na(x))))

sapply(diabetic_largedata,function(x)sum(is.na(x)))
##             encounter_id              patient_nbr                     race 
##                        0                        0                     2273 
##                   gender                      age                   weight 
##                        0                        0                    98569 
##        admission_type_id discharge_disposition_id      admission_source_id 
##                        0                        0                        0 
##         time_in_hospital               payer_code        medical_specialty 
##                        0                    40256                    49949 
##       num_lab_procedures           num_procedures          num_medications 
##                        0                        0                        0 
##        number_outpatient         number_emergency         number_inpatient 
##                        0                        0                        0 
##                   diag_1                   diag_2                   diag_3 
##                       21                      358                     1423 
##         number_diagnoses            max_glu_serum                A1Cresult 
##                        0                        0                        0 
##                metformin              repaglinide              nateglinide 
##                        0                        0                        0 
##           chlorpropamide              glimepiride            acetohexamide 
##                        0                        0                        0 
##                glipizide                glyburide              tolbutamide 
##                        0                        0                        0 
##             pioglitazone            rosiglitazone                 acarbose 
##                        0                        0                        0 
##                 miglitol             troglitazone               tolazamide 
##                        0                        0                        0 
##                  examide              citoglipton                  insulin 
##                        0                        0                        0 
##      glyburide.metformin      glipizide.metformin glimepiride.pioglitazone 
##                        0                        0                        0 
##  metformin.rosiglitazone   metformin.pioglitazone                   change 
##                        0                        0                        0 
##              diabetesMed               readmitted 
##                        0                        0
str(diabetic_largedata)
## 'data.frame':    101766 obs. of  50 variables:
##  $ encounter_id            : int  2278392 149190 64410 500364 16680 35754 55842 63768 12522 15738 ...
##  $ patient_nbr             : int  8222157 55629189 86047875 82442376 42519267 82637451 84259809 114882984 48330783 63555939 ...
##  $ race                    : Factor w/ 5 levels "AfricanAmerican",..: 3 3 1 3 3 3 3 3 3 3 ...
##  $ gender                  : Factor w/ 3 levels "Female","Male",..: 1 1 1 2 2 2 2 2 1 1 ...
##  $ age                     : Factor w/ 10 levels "[0-10)","[10-20)",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ weight                  : Factor w/ 9 levels "[0-25)","[100-125)",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ admission_type_id       : int  6 1 1 1 1 2 3 1 2 3 ...
##  $ discharge_disposition_id: int  25 1 1 1 1 1 1 1 1 3 ...
##  $ admission_source_id     : int  1 7 7 7 7 2 2 7 4 4 ...
##  $ time_in_hospital        : int  1 3 2 2 1 3 4 5 13 12 ...
##  $ payer_code              : Factor w/ 17 levels "BC","CH","CM",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ medical_specialty       : Factor w/ 72 levels "AllergyandImmunology",..: 38 NA NA NA NA NA NA NA NA 19 ...
##  $ num_lab_procedures      : int  41 59 11 44 51 31 70 73 68 33 ...
##  $ num_procedures          : int  0 0 5 1 0 6 1 0 2 3 ...
##  $ num_medications         : int  1 18 13 16 8 16 21 12 28 18 ...
##  $ number_outpatient       : int  0 0 2 0 0 0 0 0 0 0 ...
##  $ number_emergency        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient        : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ diag_1                  : Factor w/ 716 levels "10","11","110",..: 125 144 455 555 55 264 264 277 253 283 ...
##  $ diag_2                  : Factor w/ 748 levels "11","110","111",..: NA 80 79 98 25 247 247 315 261 47 ...
##  $ diag_3                  : Factor w/ 789 levels "11","110","111",..: NA 122 767 249 87 87 771 87 230 318 ...
##  $ number_diagnoses        : int  1 9 6 7 5 9 7 8 8 8 ...
##  $ max_glu_serum           : Factor w/ 4 levels ">200",">300",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ A1Cresult               : Factor w/ 4 levels ">7",">8","None",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ metformin               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 2 2 ...
##  $ repaglinide             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ nateglinide             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ chlorpropamide          : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ glimepiride             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 2 2 ...
##  $ acetohexamide           : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ glipizide               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 3 2 3 2 2 2 3 2 ...
##  $ glyburide               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 3 2 2 ...
##  $ tolbutamide             : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ pioglitazone            : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ rosiglitazone           : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 3 ...
##  $ acarbose                : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ miglitol                : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ troglitazone            : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ tolazamide              : Factor w/ 3 levels "No","Steady",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ examide                 : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
##  $ citoglipton             : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
##  $ insulin                 : Factor w/ 4 levels "Down","No","Steady",..: 2 4 2 4 3 3 3 2 3 3 ...
##  $ glyburide.metformin     : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ glipizide.metformin     : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ glimepiride.pioglitazone: Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ metformin.rosiglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ metformin.pioglitazone  : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ change                  : Factor w/ 2 levels "Ch","No": 2 1 2 1 1 2 1 2 1 1 ...
##  $ diabetesMed             : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 2 2 2 ...
##  $ readmitted              : Factor w/ 3 levels "<30",">30","NO": 3 2 3 3 3 2 3 2 3 3 ...

#2:Checking the Missing values

library(DataExplorer)
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
plot_missing(diabetic_largedata)

md.pattern(diabetic_largedata,plot = TRUE, rotate.names = TRUE)

##       encounter_id patient_nbr gender age admission_type_id
## 1043             1           1      1   1                 1
## 25712            1           1      1   1                 1
## 1177             1           1      1   1                 1
## 31197            1           1      1   1                 1
## 513              1           1      1   1                 1
## 22467            1           1      1   1                 1
## 303              1           1      1   1                 1
## 15641            1           1      1   1                 1
## 42               1           1      1   1                 1
## 681              1           1      1   1                 1
## 42               1           1      1   1                 1
## 895              1           1      1   1                 1
## 33               1           1      1   1                 1
## 271              1           1      1   1                 1
## 18               1           1      1   1                 1
## 209              1           1      1   1                 1
## 5                1           1      1   1                 1
## 268              1           1      1   1                 1
## 2                1           1      1   1                 1
## 192              1           1      1   1                 1
## 1                1           1      1   1                 1
## 467              1           1      1   1                 1
## 4                1           1      1   1                 1
## 147              1           1      1   1                 1
## 34               1           1      1   1                 1
## 14               1           1      1   1                 1
## 1                1           1      1   1                 1
## 7                1           1      1   1                 1
## 1                1           1      1   1                 1
## 1                1           1      1   1                 1
## 39               1           1      1   1                 1
## 29               1           1      1   1                 1
## 1                1           1      1   1                 1
## 2                1           1      1   1                 1
## 1                1           1      1   1                 1
## 1                1           1      1   1                 1
## 6                1           1      1   1                 1
## 3                1           1      1   1                 1
## 65               1           1      1   1                 1
## 39               1           1      1   1                 1
## 3                1           1      1   1                 1
## 127              1           1      1   1                 1
## 3                1           1      1   1                 1
## 22               1           1      1   1                 1
## 1                1           1      1   1                 1
## 11               1           1      1   1                 1
## 3                1           1      1   1                 1
## 1                1           1      1   1                 1
## 5                1           1      1   1                 1
## 5                1           1      1   1                 1
## 5                1           1      1   1                 1
## 3                1           1      1   1                 1
## 2                1           1      1   1                 1
## 1                1           1      1   1                 1
##                  0           0      0   0                 0
##       discharge_disposition_id admission_source_id time_in_hospital
## 1043                         1                   1                1
## 25712                        1                   1                1
## 1177                         1                   1                1
## 31197                        1                   1                1
## 513                          1                   1                1
## 22467                        1                   1                1
## 303                          1                   1                1
## 15641                        1                   1                1
## 42                           1                   1                1
## 681                          1                   1                1
## 42                           1                   1                1
## 895                          1                   1                1
## 33                           1                   1                1
## 271                          1                   1                1
## 18                           1                   1                1
## 209                          1                   1                1
## 5                            1                   1                1
## 268                          1                   1                1
## 2                            1                   1                1
## 192                          1                   1                1
## 1                            1                   1                1
## 467                          1                   1                1
## 4                            1                   1                1
## 147                          1                   1                1
## 34                           1                   1                1
## 14                           1                   1                1
## 1                            1                   1                1
## 7                            1                   1                1
## 1                            1                   1                1
## 1                            1                   1                1
## 39                           1                   1                1
## 29                           1                   1                1
## 1                            1                   1                1
## 2                            1                   1                1
## 1                            1                   1                1
## 1                            1                   1                1
## 6                            1                   1                1
## 3                            1                   1                1
## 65                           1                   1                1
## 39                           1                   1                1
## 3                            1                   1                1
## 127                          1                   1                1
## 3                            1                   1                1
## 22                           1                   1                1
## 1                            1                   1                1
## 11                           1                   1                1
## 3                            1                   1                1
## 1                            1                   1                1
## 5                            1                   1                1
## 5                            1                   1                1
## 5                            1                   1                1
## 3                            1                   1                1
## 2                            1                   1                1
## 1                            1                   1                1
##                              0                   0                0
##       num_lab_procedures num_procedures num_medications number_outpatient
## 1043                   1              1               1                 1
## 25712                  1              1               1                 1
## 1177                   1              1               1                 1
## 31197                  1              1               1                 1
## 513                    1              1               1                 1
## 22467                  1              1               1                 1
## 303                    1              1               1                 1
## 15641                  1              1               1                 1
## 42                     1              1               1                 1
## 681                    1              1               1                 1
## 42                     1              1               1                 1
## 895                    1              1               1                 1
## 33                     1              1               1                 1
## 271                    1              1               1                 1
## 18                     1              1               1                 1
## 209                    1              1               1                 1
## 5                      1              1               1                 1
## 268                    1              1               1                 1
## 2                      1              1               1                 1
## 192                    1              1               1                 1
## 1                      1              1               1                 1
## 467                    1              1               1                 1
## 4                      1              1               1                 1
## 147                    1              1               1                 1
## 34                     1              1               1                 1
## 14                     1              1               1                 1
## 1                      1              1               1                 1
## 7                      1              1               1                 1
## 1                      1              1               1                 1
## 1                      1              1               1                 1
## 39                     1              1               1                 1
## 29                     1              1               1                 1
## 1                      1              1               1                 1
## 2                      1              1               1                 1
## 1                      1              1               1                 1
## 1                      1              1               1                 1
## 6                      1              1               1                 1
## 3                      1              1               1                 1
## 65                     1              1               1                 1
## 39                     1              1               1                 1
## 3                      1              1               1                 1
## 127                    1              1               1                 1
## 3                      1              1               1                 1
## 22                     1              1               1                 1
## 1                      1              1               1                 1
## 11                     1              1               1                 1
## 3                      1              1               1                 1
## 1                      1              1               1                 1
## 5                      1              1               1                 1
## 5                      1              1               1                 1
## 5                      1              1               1                 1
## 3                      1              1               1                 1
## 2                      1              1               1                 1
## 1                      1              1               1                 1
##                        0              0               0                 0
##       number_emergency number_inpatient number_diagnoses max_glu_serum
## 1043                 1                1                1             1
## 25712                1                1                1             1
## 1177                 1                1                1             1
## 31197                1                1                1             1
## 513                  1                1                1             1
## 22467                1                1                1             1
## 303                  1                1                1             1
## 15641                1                1                1             1
## 42                   1                1                1             1
## 681                  1                1                1             1
## 42                   1                1                1             1
## 895                  1                1                1             1
## 33                   1                1                1             1
## 271                  1                1                1             1
## 18                   1                1                1             1
## 209                  1                1                1             1
## 5                    1                1                1             1
## 268                  1                1                1             1
## 2                    1                1                1             1
## 192                  1                1                1             1
## 1                    1                1                1             1
## 467                  1                1                1             1
## 4                    1                1                1             1
## 147                  1                1                1             1
## 34                   1                1                1             1
## 14                   1                1                1             1
## 1                    1                1                1             1
## 7                    1                1                1             1
## 1                    1                1                1             1
## 1                    1                1                1             1
## 39                   1                1                1             1
## 29                   1                1                1             1
## 1                    1                1                1             1
## 2                    1                1                1             1
## 1                    1                1                1             1
## 1                    1                1                1             1
## 6                    1                1                1             1
## 3                    1                1                1             1
## 65                   1                1                1             1
## 39                   1                1                1             1
## 3                    1                1                1             1
## 127                  1                1                1             1
## 3                    1                1                1             1
## 22                   1                1                1             1
## 1                    1                1                1             1
## 11                   1                1                1             1
## 3                    1                1                1             1
## 1                    1                1                1             1
## 5                    1                1                1             1
## 5                    1                1                1             1
## 5                    1                1                1             1
## 3                    1                1                1             1
## 2                    1                1                1             1
## 1                    1                1                1             1
##                      0                0                0             0
##       A1Cresult metformin repaglinide nateglinide chlorpropamide glimepiride
## 1043          1         1           1           1              1           1
## 25712         1         1           1           1              1           1
## 1177          1         1           1           1              1           1
## 31197         1         1           1           1              1           1
## 513           1         1           1           1              1           1
## 22467         1         1           1           1              1           1
## 303           1         1           1           1              1           1
## 15641         1         1           1           1              1           1
## 42            1         1           1           1              1           1
## 681           1         1           1           1              1           1
## 42            1         1           1           1              1           1
## 895           1         1           1           1              1           1
## 33            1         1           1           1              1           1
## 271           1         1           1           1              1           1
## 18            1         1           1           1              1           1
## 209           1         1           1           1              1           1
## 5             1         1           1           1              1           1
## 268           1         1           1           1              1           1
## 2             1         1           1           1              1           1
## 192           1         1           1           1              1           1
## 1             1         1           1           1              1           1
## 467           1         1           1           1              1           1
## 4             1         1           1           1              1           1
## 147           1         1           1           1              1           1
## 34            1         1           1           1              1           1
## 14            1         1           1           1              1           1
## 1             1         1           1           1              1           1
## 7             1         1           1           1              1           1
## 1             1         1           1           1              1           1
## 1             1         1           1           1              1           1
## 39            1         1           1           1              1           1
## 29            1         1           1           1              1           1
## 1             1         1           1           1              1           1
## 2             1         1           1           1              1           1
## 1             1         1           1           1              1           1
## 1             1         1           1           1              1           1
## 6             1         1           1           1              1           1
## 3             1         1           1           1              1           1
## 65            1         1           1           1              1           1
## 39            1         1           1           1              1           1
## 3             1         1           1           1              1           1
## 127           1         1           1           1              1           1
## 3             1         1           1           1              1           1
## 22            1         1           1           1              1           1
## 1             1         1           1           1              1           1
## 11            1         1           1           1              1           1
## 3             1         1           1           1              1           1
## 1             1         1           1           1              1           1
## 5             1         1           1           1              1           1
## 5             1         1           1           1              1           1
## 5             1         1           1           1              1           1
## 3             1         1           1           1              1           1
## 2             1         1           1           1              1           1
## 1             1         1           1           1              1           1
##               0         0           0           0              0           0
##       acetohexamide glipizide glyburide tolbutamide pioglitazone rosiglitazone
## 1043              1         1         1           1            1             1
## 25712             1         1         1           1            1             1
## 1177              1         1         1           1            1             1
## 31197             1         1         1           1            1             1
## 513               1         1         1           1            1             1
## 22467             1         1         1           1            1             1
## 303               1         1         1           1            1             1
## 15641             1         1         1           1            1             1
## 42                1         1         1           1            1             1
## 681               1         1         1           1            1             1
## 42                1         1         1           1            1             1
## 895               1         1         1           1            1             1
## 33                1         1         1           1            1             1
## 271               1         1         1           1            1             1
## 18                1         1         1           1            1             1
## 209               1         1         1           1            1             1
## 5                 1         1         1           1            1             1
## 268               1         1         1           1            1             1
## 2                 1         1         1           1            1             1
## 192               1         1         1           1            1             1
## 1                 1         1         1           1            1             1
## 467               1         1         1           1            1             1
## 4                 1         1         1           1            1             1
## 147               1         1         1           1            1             1
## 34                1         1         1           1            1             1
## 14                1         1         1           1            1             1
## 1                 1         1         1           1            1             1
## 7                 1         1         1           1            1             1
## 1                 1         1         1           1            1             1
## 1                 1         1         1           1            1             1
## 39                1         1         1           1            1             1
## 29                1         1         1           1            1             1
## 1                 1         1         1           1            1             1
## 2                 1         1         1           1            1             1
## 1                 1         1         1           1            1             1
## 1                 1         1         1           1            1             1
## 6                 1         1         1           1            1             1
## 3                 1         1         1           1            1             1
## 65                1         1         1           1            1             1
## 39                1         1         1           1            1             1
## 3                 1         1         1           1            1             1
## 127               1         1         1           1            1             1
## 3                 1         1         1           1            1             1
## 22                1         1         1           1            1             1
## 1                 1         1         1           1            1             1
## 11                1         1         1           1            1             1
## 3                 1         1         1           1            1             1
## 1                 1         1         1           1            1             1
## 5                 1         1         1           1            1             1
## 5                 1         1         1           1            1             1
## 5                 1         1         1           1            1             1
## 3                 1         1         1           1            1             1
## 2                 1         1         1           1            1             1
## 1                 1         1         1           1            1             1
##                   0         0         0           0            0             0
##       acarbose miglitol troglitazone tolazamide examide citoglipton insulin
## 1043         1        1            1          1       1           1       1
## 25712        1        1            1          1       1           1       1
## 1177         1        1            1          1       1           1       1
## 31197        1        1            1          1       1           1       1
## 513          1        1            1          1       1           1       1
## 22467        1        1            1          1       1           1       1
## 303          1        1            1          1       1           1       1
## 15641        1        1            1          1       1           1       1
## 42           1        1            1          1       1           1       1
## 681          1        1            1          1       1           1       1
## 42           1        1            1          1       1           1       1
## 895          1        1            1          1       1           1       1
## 33           1        1            1          1       1           1       1
## 271          1        1            1          1       1           1       1
## 18           1        1            1          1       1           1       1
## 209          1        1            1          1       1           1       1
## 5            1        1            1          1       1           1       1
## 268          1        1            1          1       1           1       1
## 2            1        1            1          1       1           1       1
## 192          1        1            1          1       1           1       1
## 1            1        1            1          1       1           1       1
## 467          1        1            1          1       1           1       1
## 4            1        1            1          1       1           1       1
## 147          1        1            1          1       1           1       1
## 34           1        1            1          1       1           1       1
## 14           1        1            1          1       1           1       1
## 1            1        1            1          1       1           1       1
## 7            1        1            1          1       1           1       1
## 1            1        1            1          1       1           1       1
## 1            1        1            1          1       1           1       1
## 39           1        1            1          1       1           1       1
## 29           1        1            1          1       1           1       1
## 1            1        1            1          1       1           1       1
## 2            1        1            1          1       1           1       1
## 1            1        1            1          1       1           1       1
## 1            1        1            1          1       1           1       1
## 6            1        1            1          1       1           1       1
## 3            1        1            1          1       1           1       1
## 65           1        1            1          1       1           1       1
## 39           1        1            1          1       1           1       1
## 3            1        1            1          1       1           1       1
## 127          1        1            1          1       1           1       1
## 3            1        1            1          1       1           1       1
## 22           1        1            1          1       1           1       1
## 1            1        1            1          1       1           1       1
## 11           1        1            1          1       1           1       1
## 3            1        1            1          1       1           1       1
## 1            1        1            1          1       1           1       1
## 5            1        1            1          1       1           1       1
## 5            1        1            1          1       1           1       1
## 5            1        1            1          1       1           1       1
## 3            1        1            1          1       1           1       1
## 2            1        1            1          1       1           1       1
## 1            1        1            1          1       1           1       1
##              0        0            0          0       0           0       0
##       glyburide.metformin glipizide.metformin glimepiride.pioglitazone
## 1043                    1                   1                        1
## 25712                   1                   1                        1
## 1177                    1                   1                        1
## 31197                   1                   1                        1
## 513                     1                   1                        1
## 22467                   1                   1                        1
## 303                     1                   1                        1
## 15641                   1                   1                        1
## 42                      1                   1                        1
## 681                     1                   1                        1
## 42                      1                   1                        1
## 895                     1                   1                        1
## 33                      1                   1                        1
## 271                     1                   1                        1
## 18                      1                   1                        1
## 209                     1                   1                        1
## 5                       1                   1                        1
## 268                     1                   1                        1
## 2                       1                   1                        1
## 192                     1                   1                        1
## 1                       1                   1                        1
## 467                     1                   1                        1
## 4                       1                   1                        1
## 147                     1                   1                        1
## 34                      1                   1                        1
## 14                      1                   1                        1
## 1                       1                   1                        1
## 7                       1                   1                        1
## 1                       1                   1                        1
## 1                       1                   1                        1
## 39                      1                   1                        1
## 29                      1                   1                        1
## 1                       1                   1                        1
## 2                       1                   1                        1
## 1                       1                   1                        1
## 1                       1                   1                        1
## 6                       1                   1                        1
## 3                       1                   1                        1
## 65                      1                   1                        1
## 39                      1                   1                        1
## 3                       1                   1                        1
## 127                     1                   1                        1
## 3                       1                   1                        1
## 22                      1                   1                        1
## 1                       1                   1                        1
## 11                      1                   1                        1
## 3                       1                   1                        1
## 1                       1                   1                        1
## 5                       1                   1                        1
## 5                       1                   1                        1
## 5                       1                   1                        1
## 3                       1                   1                        1
## 2                       1                   1                        1
## 1                       1                   1                        1
##                         0                   0                        0
##       metformin.rosiglitazone metformin.pioglitazone change diabetesMed
## 1043                        1                      1      1           1
## 25712                       1                      1      1           1
## 1177                        1                      1      1           1
## 31197                       1                      1      1           1
## 513                         1                      1      1           1
## 22467                       1                      1      1           1
## 303                         1                      1      1           1
## 15641                       1                      1      1           1
## 42                          1                      1      1           1
## 681                         1                      1      1           1
## 42                          1                      1      1           1
## 895                         1                      1      1           1
## 33                          1                      1      1           1
## 271                         1                      1      1           1
## 18                          1                      1      1           1
## 209                         1                      1      1           1
## 5                           1                      1      1           1
## 268                         1                      1      1           1
## 2                           1                      1      1           1
## 192                         1                      1      1           1
## 1                           1                      1      1           1
## 467                         1                      1      1           1
## 4                           1                      1      1           1
## 147                         1                      1      1           1
## 34                          1                      1      1           1
## 14                          1                      1      1           1
## 1                           1                      1      1           1
## 7                           1                      1      1           1
## 1                           1                      1      1           1
## 1                           1                      1      1           1
## 39                          1                      1      1           1
## 29                          1                      1      1           1
## 1                           1                      1      1           1
## 2                           1                      1      1           1
## 1                           1                      1      1           1
## 1                           1                      1      1           1
## 6                           1                      1      1           1
## 3                           1                      1      1           1
## 65                          1                      1      1           1
## 39                          1                      1      1           1
## 3                           1                      1      1           1
## 127                         1                      1      1           1
## 3                           1                      1      1           1
## 22                          1                      1      1           1
## 1                           1                      1      1           1
## 11                          1                      1      1           1
## 3                           1                      1      1           1
## 1                           1                      1      1           1
## 5                           1                      1      1           1
## 5                           1                      1      1           1
## 5                           1                      1      1           1
## 3                           1                      1      1           1
## 2                           1                      1      1           1
## 1                           1                      1      1           1
##                             0                      0      0           0
##       readmitted diag_1 diag_2 diag_3 race payer_code medical_specialty weight
## 1043           1      1      1      1    1          1                 1      1
## 25712          1      1      1      1    1          1                 1      0
## 1177           1      1      1      1    1          1                 0      1
## 31197          1      1      1      1    1          1                 0      0
## 513            1      1      1      1    1          0                 1      1
## 22467          1      1      1      1    1          0                 1      0
## 303            1      1      1      1    1          0                 0      1
## 15641          1      1      1      1    1          0                 0      0
## 42             1      1      1      1    0          1                 1      1
## 681            1      1      1      1    0          1                 1      0
## 42             1      1      1      1    0          1                 0      1
## 895            1      1      1      1    0          1                 0      0
## 33             1      1      1      1    0          0                 1      1
## 271            1      1      1      1    0          0                 1      0
## 18             1      1      1      1    0          0                 0      1
## 209            1      1      1      1    0          0                 0      0
## 5              1      1      1      0    1          1                 1      1
## 268            1      1      1      0    1          1                 1      0
## 2              1      1      1      0    1          1                 0      1
## 192            1      1      1      0    1          1                 0      0
## 1              1      1      1      0    1          0                 1      1
## 467            1      1      1      0    1          0                 1      0
## 4              1      1      1      0    1          0                 0      1
## 147            1      1      1      0    1          0                 0      0
## 34             1      1      1      0    0          1                 1      0
## 14             1      1      1      0    0          1                 0      0
## 1              1      1      1      0    0          0                 1      1
## 7              1      1      1      0    0          0                 1      0
## 1              1      1      1      0    0          0                 0      1
## 1              1      1      1      0    0          0                 0      0
## 39             1      1      0      1    1          1                 1      0
## 29             1      1      0      1    1          1                 0      0
## 1              1      1      0      1    1          0                 1      1
## 2              1      1      0      1    1          0                 1      0
## 1              1      1      0      1    1          0                 0      1
## 1              1      1      0      1    1          0                 0      0
## 6              1      1      0      1    0          1                 1      0
## 3              1      1      0      0    1          1                 1      1
## 65             1      1      0      0    1          1                 1      0
## 39             1      1      0      0    1          1                 0      0
## 3              1      1      0      0    1          0                 1      1
## 127            1      1      0      0    1          0                 1      0
## 3              1      1      0      0    1          0                 0      1
## 22             1      1      0      0    1          0                 0      0
## 1              1      1      0      0    0          1                 1      1
## 11             1      1      0      0    0          1                 1      0
## 3              1      1      0      0    0          1                 0      0
## 1              1      1      0      0    0          0                 1      0
## 5              1      0      1      1    1          1                 1      0
## 5              1      0      1      1    1          1                 0      0
## 5              1      0      1      1    1          0                 1      0
## 3              1      0      1      1    1          0                 0      0
## 2              1      0      1      1    0          0                 1      0
## 1              1      0      0      0    1          0                 1      0
##                0     21    358   1423 2273      40256             49949  98569
##             
## 1043       0
## 25712      1
## 1177       1
## 31197      2
## 513        1
## 22467      2
## 303        2
## 15641      3
## 42         1
## 681        2
## 42         2
## 895        3
## 33         2
## 271        3
## 18         3
## 209        4
## 5          1
## 268        2
## 2          2
## 192        3
## 1          2
## 467        3
## 4          3
## 147        4
## 34         3
## 14         4
## 1          3
## 7          4
## 1          4
## 1          5
## 39         2
## 29         3
## 1          2
## 2          3
## 1          3
## 1          4
## 6          3
## 3          2
## 65         3
## 39         4
## 3          3
## 127        4
## 3          4
## 22         5
## 1          3
## 11         4
## 3          5
## 1          5
## 5          2
## 5          3
## 5          3
## 3          4
## 2          4
## 1          5
##       192849

#3:First Drop

#removing the most missed variables and irrelevant variables(encounter id,patient nbr,weight and payer_code(insurance) and medical speciality, and )
diabetic_largedata<-diabetic_largedata[,-c(1,2,6,11,12)]
str(diabetic_largedata)
## 'data.frame':    101766 obs. of  45 variables:
##  $ race                    : Factor w/ 5 levels "AfricanAmerican",..: 3 3 1 3 3 3 3 3 3 3 ...
##  $ gender                  : Factor w/ 3 levels "Female","Male",..: 1 1 1 2 2 2 2 2 1 1 ...
##  $ age                     : Factor w/ 10 levels "[0-10)","[10-20)",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ admission_type_id       : int  6 1 1 1 1 2 3 1 2 3 ...
##  $ discharge_disposition_id: int  25 1 1 1 1 1 1 1 1 3 ...
##  $ admission_source_id     : int  1 7 7 7 7 2 2 7 4 4 ...
##  $ time_in_hospital        : int  1 3 2 2 1 3 4 5 13 12 ...
##  $ num_lab_procedures      : int  41 59 11 44 51 31 70 73 68 33 ...
##  $ num_procedures          : int  0 0 5 1 0 6 1 0 2 3 ...
##  $ num_medications         : int  1 18 13 16 8 16 21 12 28 18 ...
##  $ number_outpatient       : int  0 0 2 0 0 0 0 0 0 0 ...
##  $ number_emergency        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient        : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ diag_1                  : Factor w/ 716 levels "10","11","110",..: 125 144 455 555 55 264 264 277 253 283 ...
##  $ diag_2                  : Factor w/ 748 levels "11","110","111",..: NA 80 79 98 25 247 247 315 261 47 ...
##  $ diag_3                  : Factor w/ 789 levels "11","110","111",..: NA 122 767 249 87 87 771 87 230 318 ...
##  $ number_diagnoses        : int  1 9 6 7 5 9 7 8 8 8 ...
##  $ max_glu_serum           : Factor w/ 4 levels ">200",">300",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ A1Cresult               : Factor w/ 4 levels ">7",">8","None",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ metformin               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 2 2 ...
##  $ repaglinide             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ nateglinide             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ chlorpropamide          : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ glimepiride             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 2 2 ...
##  $ acetohexamide           : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ glipizide               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 3 2 3 2 2 2 3 2 ...
##  $ glyburide               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 3 2 2 ...
##  $ tolbutamide             : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ pioglitazone            : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ rosiglitazone           : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 3 ...
##  $ acarbose                : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ miglitol                : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ troglitazone            : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ tolazamide              : Factor w/ 3 levels "No","Steady",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ examide                 : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
##  $ citoglipton             : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
##  $ insulin                 : Factor w/ 4 levels "Down","No","Steady",..: 2 4 2 4 3 3 3 2 3 3 ...
##  $ glyburide.metformin     : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ glipizide.metformin     : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ glimepiride.pioglitazone: Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ metformin.rosiglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ metformin.pioglitazone  : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ change                  : Factor w/ 2 levels "Ch","No": 2 1 2 1 1 2 1 2 1 1 ...
##  $ diabetesMed             : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 2 2 2 ...
##  $ readmitted              : Factor w/ 3 levels "<30",">30","NO": 3 2 3 3 3 2 3 2 3 3 ...
#just check if we have any question mark
plot(sapply(diabetic_largedata,function(x)sum(grepl("\\?",x))))

plot(sapply(diabetic_largedata,function(x)sum(grepl(" Unknown/Invalid",x))))

#many of them are diabetes medication mybe we can group all of them together (group all the medication together) or we can just keep  diabetesMed 
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(diabetic_largedata,main="NA's Precentage",col=c("black","pink"))

##4:Groupimg the Outcome variable

#here we have 3 classes for outcome varible(readmited) make it to 2 classes 
table(diabetic_largedata$readmitted)
## 
##   <30   >30    NO 
## 11357 35545 54864
#the patient who come after 30 days it is considered as "YES"  if he come back before 30 days the hospital should pay all(reimbersment) so they want to predict the ones that are ptential to come bac to prevent than and don't discharge them soon !
diabetic_largedata$readmitted<-ifelse(diabetic_largedata$readmitted==">30"|diabetic_largedata$readmitted=="<30","YES","NO")
table(diabetic_largedata$readmitted)
## 
##    NO   YES 
## 54864 46902
table(diabetic_largedata$A1Cresult)
## 
##    >7    >8  None  Norm 
##  3812  8216 84748  4990
table(diabetic_largedata$max_glu_serum)
## 
##  >200  >300  None  Norm 
##  1485  1264 96420  2597
diabetic_largedata<-subset(diabetic_largedata,max_glu_serum!="None",)
diabetic_largedata<-subset(diabetic_largedata,A1Cresult!="None",)
diabetic_largedata<-droplevels(diabetic_largedata)
plot(table(diabetic_largedata$A1Cresult))

plot(table(diabetic_largedata$max_glu_serum))

nrow(diabetic_largedata)
## [1] 298

##Checking imbalanceness of Outcome

#install.packages("lessR")
library(lessR)
## 
## lessR 4.2.8                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")   Read text, Excel, SPSS, SAS, or R data file
##   d is default data frame, data= in analysis routines optional
## 
## Learn about reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables.
##   Enter:  browseVignettes("lessR")
## 
## View changes in this and recent versions of lessR.
##   Enter: news(package="lessR")
## 
## Interactive data analysis.
##   Enter: interact()
PieChart(data=diabetic_largedata,readmitted, fill = c("orange", "blue"), main = "Class distributionoof readmitted",values = "%")

## >>> suggestions
## PieChart(readmitted, hole=0)  # traditional pie chart
## PieChart(readmitted, values="%")  # display %'s on the chart
## PieChart(readmitted)  # bar chart
## Plot(readmitted)  # bubble plot
## Plot(readmitted, values="count")  # lollipop plot 
## 
## --- readmitted --- 
## 
##                   NO    YES     Total 
## Frequencies:     123    175       298 
## Proportions:   0.413  0.587     1.000 
## 
## Chi-squared test of null hypothesis of equal probabilities 
##   Chisq = 9.074, df = 1, p-value = 0.003

#Second drop

#install.packages("Amelia")
library(naniar)
missmap(diabetic_largedata)

gg_miss_var(diabetic_largedata)

gg_miss_upset(diabetic_largedata)

#around 2% of the data can be omitted
#dropping the rows that contain NAs
diabetic_largedata<-na.omit(diabetic_largedata)
nrow(diabetic_largedata)
## [1] 289
plot_missing(diabetic_largedata)

#Checking correlations of categorical

#using polycor for getting the correlation of some variables with categorical variables 
library(polycor)
#correlation of admission_type_id,discharge_disposition_id,admission_source_id with the outcome

correlationtable <- data.frame(
  variable = c("admission_source_id", "discharge_disposition_id", "admission_type_id","diag1","diag2","diag3"),
  readmitted = c(
    polychor(diabetic_largedata$admission_source_id, diabetic_largedata$readmitted),
    polychor(diabetic_largedata$discharge_disposition_id, diabetic_largedata$readmitted),
    polychor(diabetic_largedata$admission_type_id, diabetic_largedata$readmitted),
    polychor(diabetic_largedata$diag_1,diabetic_largedata$readmitted),
    polychor(diabetic_largedata$diag_2,diabetic_largedata$readmitted),
    polychor(diabetic_largedata$diag_3,diabetic_largedata$readmitted)
  ),
  age=c(polychor(diabetic_largedata$admission_source_id,diabetic_largedata$age),
polychor(diabetic_largedata$discharge_disposition_id,diabetic_largedata$age),
polychor(diabetic_largedata$admission_type_id,diabetic_largedata$age),
polychor(diabetic_largedata$diag_1,diabetic_largedata$age),
polychor(diabetic_largedata$diag_2,diabetic_largedata$age),
polychor(diabetic_largedata$diag_3,diabetic_largedata$age)
)
)

correlationtable
##                   variable  readmitted          age
## 1      admission_source_id -0.23278752 -0.033934736
## 2 discharge_disposition_id  0.05802495  0.237600899
## 3        admission_type_id -0.17096102  0.209122762
## 4                    diag1 -0.04364375  0.005544923
## 5                    diag2 -0.01033573  0.017288829
## 6                    diag3  0.07194527  0.002855034

#Checking corelation of Numeric variables

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lessR':
## 
##     recode, rename
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
numdiab = c()
for (i in 1:ncol(diabetic_largedata)) {
  if (is.numeric(diabetic_largedata[,i])){
    numdiab = append(numdiab, i)
  }
}
numdiab<-diabetic_largedata[,numdiab]
numdiab<-cbind(numdiab,as.numeric(diabetic_largedata$diag_1),as.numeric(diabetic_largedata$diag_2),as.numeric(diabetic_largedata$diag_3))
head(numdiab)
##     admission_type_id discharge_disposition_id admission_source_id
## 163                 6                        3                   7
## 461                 6                        1                   7
## 594                 6                        1                   7
## 697                 6                        6                   7
## 772                 6                        1                   2
## 824                 6                        1                   7
##     time_in_hospital num_lab_procedures num_procedures num_medications
## 163                5                 47              1               6
## 461               10                 72              1              19
## 594                2                 61              0               5
## 697               11                 71              1              20
## 772               14                 43              0              11
## 824                7                105              3              16
##     number_outpatient number_emergency number_inpatient number_diagnoses
## 163                 0                0                0                5
## 461                 0                0                0                5
## 594                 0                0                0                5
## 697                 0                0                0                5
## 772                 0                0                0                3
## 824                 0                0                0                5
##     as.numeric(diabetic_largedata$diag_1) as.numeric(diabetic_largedata$diag_2)
## 163                                    24                                    28
## 461                                     5                                    23
## 594                                    19                                    89
## 697                                    87                                     9
## 772                                    22                                     7
## 824                                    36                                     9
##     as.numeric(diabetic_largedata$diag_3)
## 163                                    46
## 461                                    28
## 594                                     9
## 697                                   100
## 772                                    66
## 824                                    21
library(corrplot)
## corrplot 0.92 loaded
corrplot(cor(numdiab),type='upper',tl.srt = 30,method="number",tl.cex = 0.6,bg="gray",title = "Correlation of Numeric Variables")

#admistion type id is highly corelated with some variables but since it is id it should be categorical to prevent over shadow of bigger numbers so I made another data set of literraly numeric varibles 
numdiab2<-diabetic_largedata[, c("time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications",  "number_diagnoses", "number_outpatient", "number_inpatient","number_emergency")]
corrplot(cor(numdiab2),type='upper',method = 'pie')

library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
numdiab2$readmitted<-diabetic_largedata$readmitted
ggpairs(numdiab2, aes(alpha=0.1, color = readmitted))+  theme(plot.title = element_text(size = 16),
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 10),
        strip.text = element_text(size = 12),
        panel.spacing = unit(0.2, "lines"),
        panel.background = element_blank(),
        panel.border = element_rect(color = "black", fill = NA, size = 0.1),
        plot.margin = unit(c(1,1,1,1), "cm"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cbind(colnames(diabetic_largedata))
##       [,1]                      
##  [1,] "race"                    
##  [2,] "gender"                  
##  [3,] "age"                     
##  [4,] "admission_type_id"       
##  [5,] "discharge_disposition_id"
##  [6,] "admission_source_id"     
##  [7,] "time_in_hospital"        
##  [8,] "num_lab_procedures"      
##  [9,] "num_procedures"          
## [10,] "num_medications"         
## [11,] "number_outpatient"       
## [12,] "number_emergency"        
## [13,] "number_inpatient"        
## [14,] "diag_1"                  
## [15,] "diag_2"                  
## [16,] "diag_3"                  
## [17,] "number_diagnoses"        
## [18,] "max_glu_serum"           
## [19,] "A1Cresult"               
## [20,] "metformin"               
## [21,] "repaglinide"             
## [22,] "nateglinide"             
## [23,] "chlorpropamide"          
## [24,] "glimepiride"             
## [25,] "acetohexamide"           
## [26,] "glipizide"               
## [27,] "glyburide"               
## [28,] "tolbutamide"             
## [29,] "pioglitazone"            
## [30,] "rosiglitazone"           
## [31,] "acarbose"                
## [32,] "miglitol"                
## [33,] "troglitazone"            
## [34,] "tolazamide"              
## [35,] "examide"                 
## [36,] "citoglipton"             
## [37,] "insulin"                 
## [38,] "glyburide.metformin"     
## [39,] "glipizide.metformin"     
## [40,] "glimepiride.pioglitazone"
## [41,] "metformin.rosiglitazone" 
## [42,] "metformin.pioglitazone"  
## [43,] "change"                  
## [44,] "diabetesMed"             
## [45,] "readmitted"
#Diabetics medications = 20:42
summary(diabetic_largedata[20:42])
##   metformin   repaglinide  nateglinide  chlorpropamide glimepiride 
##  Down  :  2   No    :278   No    :288   No    :288     No    :282  
##  No    :245   Steady: 10   Steady:  1   Steady:  1     Steady:  6  
##  Steady: 36   Up    :  1                               Up    :  1  
##  Up    :  6                                                        
##  acetohexamide  glipizide    glyburide   tolbutamide pioglitazone rosiglitazone
##  No:289        Down  :  2   No    :274   No:289      Down  :  1   No    :279   
##                No    :243   Steady: 14               No    :276   Steady:  8   
##                Steady: 38   Up    :  1               Steady: 12   Up    :  2   
##                Up    :  6                                                      
##    acarbose   miglitol troglitazone tolazamide examide  citoglipton
##  No    :287   No:289   No:289       No:289     No:289   No:289     
##  Steady:  1                                                        
##  Up    :  1                                                        
##                                                                    
##    insulin    glyburide.metformin glipizide.metformin glimepiride.pioglitazone
##  Down  : 20   No    :288          No:289              No:289                  
##  No    :202   Steady:  1                                                      
##  Steady: 53                                                                   
##  Up    : 14                                                                   
##  metformin.rosiglitazone metformin.pioglitazone
##  No:289                  No:289                
##                                                
##                                                
## 

#Third Drop : Medications

diabetic_largedata<-diabetic_largedata[, -c(20:36,38:42)]
head(diabetic_largedata)
##                race gender     age admission_type_id discharge_disposition_id
## 163       Caucasian   Male [80-90)                 6                        3
## 461 AfricanAmerican Female [70-80)                 6                        1
## 594       Caucasian Female [50-60)                 6                        1
## 697           Other   Male [70-80)                 6                        6
## 772       Caucasian Female [30-40)                 6                        1
## 824       Caucasian   Male [80-90)                 6                        1
##     admission_source_id time_in_hospital num_lab_procedures num_procedures
## 163                   7                5                 47              1
## 461                   7               10                 72              1
## 594                   7                2                 61              0
## 697                   7               11                 71              1
## 772                   2               14                 43              0
## 824                   7                7                105              3
##     num_medications number_outpatient number_emergency number_inpatient diag_1
## 163               6                 0                0                0    332
## 461              19                 0                0                0 250.02
## 594               5                 0                0                0    276
## 697              20                 0                0                0    820
## 772              11                 0                0                0    296
## 824              16                 0                0                0    428
##     diag_2 diag_3 number_diagnoses max_glu_serum A1Cresult insulin change
## 163    294    425                5          >200      Norm      No     No
## 461    276    294                5          >300        >8      Up     Ch
## 594    780 250.03                5          >300        >8  Steady     No
## 697 250.02   E885                5          >200        >7      No     No
## 772    250    564                3          Norm        >7      No     No
## 824 250.02    276                5          >300        >7      No     No
##     diabetesMed readmitted
## 163          No        YES
## 461         Yes        YES
## 594         Yes         NO
## 697         Yes         NO
## 772          No        YES
## 824         Yes        YES
summary(diabetic_largedata)
##               race        gender         age     admission_type_id
##  AfricanAmerican: 51   Female:168   [50-60):65   Min.   :1.000    
##  Asian          :  7   Male  :121   [60-70):60   1st Qu.:6.000    
##  Caucasian      :180                [70-80):60   Median :6.000    
##  Hispanic       : 38                [80-90):41   Mean   :5.024    
##  Other          : 13                [40-50):38   3rd Qu.:6.000    
##                                     [30-40):17   Max.   :6.000    
##                                     (Other): 8                    
##  discharge_disposition_id admission_source_id time_in_hospital
##  Min.   : 1.000           Min.   :1.000       Min.   : 1.000  
##  1st Qu.: 1.000           1st Qu.:7.000       1st Qu.: 3.000  
##  Median : 1.000           Median :7.000       Median : 5.000  
##  Mean   : 2.197           Mean   :6.488       Mean   : 5.398  
##  3rd Qu.: 3.000           3rd Qu.:7.000       3rd Qu.: 7.000  
##  Max.   :13.000           Max.   :7.000       Max.   :14.000  
##                                                               
##  num_lab_procedures num_procedures   num_medications number_outpatient
##  Min.   : 31.0      Min.   :0.0000   Min.   : 1.00   Min.   :0.0000   
##  1st Qu.: 54.0      1st Qu.:0.0000   1st Qu.: 9.00   1st Qu.:0.0000   
##  Median : 63.0      Median :0.0000   Median :14.00   Median :0.0000   
##  Mean   : 64.2      Mean   :0.8443   Mean   :14.54   Mean   :0.1592   
##  3rd Qu.: 74.0      3rd Qu.:1.0000   3rd Qu.:19.00   3rd Qu.:0.0000   
##  Max.   :106.0      Max.   :6.0000   Max.   :35.00   Max.   :6.0000   
##                                                                       
##  number_emergency number_inpatient     diag_1        diag_2        diag_3   
##  Min.   :0.000    Min.   :0.0000   491    : 21   250    : 22   250    : 24  
##  1st Qu.:0.000    1st Qu.:0.0000   428    : 19   250.02 : 19   401    : 20  
##  Median :0.000    Median :0.0000   682    : 19   276    : 17   276    : 16  
##  Mean   :0.173    Mean   :0.6678   414    : 14   411    : 11   250.02 : 15  
##  3rd Qu.:0.000    3rd Qu.:1.0000   786    : 12   428    : 10   414    : 13  
##  Max.   :9.000    Max.   :9.0000   250.02 : 11   496    : 10   272    : 11  
##                                    (Other):193   (Other):200   (Other):190  
##  number_diagnoses max_glu_serum A1Cresult    insulin    change   diabetesMed
##  Min.   :3.000    >200: 69      >7  : 63   Down  : 20   Ch: 84   No :118    
##  1st Qu.:5.000    >300:124      >8  :171   No    :202   No:205   Yes:171    
##  Median :6.000    Norm: 96      Norm: 55   Steady: 53                       
##  Mean   :5.958                             Up    : 14                       
##  3rd Qu.:6.000                                                              
##  Max.   :9.000                                                              
##                                                                             
##   readmitted       
##  Length:289        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
str(diabetic_largedata)
## 'data.frame':    289 obs. of  23 variables:
##  $ race                    : Factor w/ 5 levels "AfricanAmerican",..: 3 1 3 5 3 3 4 3 3 3 ...
##  $ gender                  : Factor w/ 2 levels "Female","Male": 2 1 1 2 1 2 1 1 1 2 ...
##  $ age                     : Factor w/ 8 levels "[10-20)","[20-30)",..: 8 7 5 7 3 8 5 4 5 5 ...
##  $ admission_type_id       : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ discharge_disposition_id: int  3 1 1 6 1 1 1 1 1 10 ...
##  $ admission_source_id     : int  7 7 7 7 2 7 7 7 7 1 ...
##  $ time_in_hospital        : int  5 10 2 11 14 7 2 3 2 4 ...
##  $ num_lab_procedures      : int  47 72 61 71 43 105 66 76 43 41 ...
##  $ num_procedures          : int  1 1 0 1 0 3 0 0 0 1 ...
##  $ num_medications         : int  6 19 5 20 11 16 3 9 13 8 ...
##  $ number_outpatient       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_emergency        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_1                  : Factor w/ 92 levels "112","162","188",..: 24 5 19 87 22 36 6 21 73 76 ...
##  $ diag_2                  : Factor w/ 104 levels "162","174","197",..: 28 23 89 9 7 9 42 23 10 66 ...
##  $ diag_3                  : Factor w/ 104 levels "198","208","211",..: 46 28 9 100 66 21 19 31 93 6 ...
##  $ number_diagnoses        : int  5 5 5 5 3 5 3 5 5 3 ...
##  $ max_glu_serum           : Factor w/ 3 levels ">200",">300",..: 1 2 2 1 3 2 3 2 2 1 ...
##  $ A1Cresult               : Factor w/ 3 levels ">7",">8","Norm": 3 2 2 1 1 1 1 1 1 2 ...
##  $ insulin                 : Factor w/ 4 levels "Down","No","Steady",..: 2 4 3 2 2 2 2 2 2 2 ...
##  $ change                  : Factor w/ 2 levels "Ch","No": 2 1 2 2 2 2 2 1 2 2 ...
##  $ diabetesMed             : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 1 2 ...
##  $ readmitted              : chr  "YES" "YES" "NO" "NO" ...
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:polycor':
## 
##     polyserial
## The following objects are masked from 'package:lessR':
## 
##     reflect, rescale, scree, skew
data.frame(describe.by(diabetic_largedata))
##                          vars   n       mean         sd median    trimmed
## race*                       1 289  2.8442907  1.0103511      3  2.8712446
## gender*                     2 289  1.4186851  0.4941993      1  1.3991416
## age*                        3 289  5.7024221  1.5947051      6  5.7896996
## admission_type_id           4 289  5.0242215  1.9603982      6  5.3905579
## discharge_disposition_id    5 289  2.1972318  2.1584442      1  1.7339056
## admission_source_id         6 289  6.4878893  1.6689647      7  7.0000000
## time_in_hospital            7 289  5.3979239  3.0477899      5  5.1158798
## num_lab_procedures          8 289 64.2006920 14.4842299     63 63.6523605
## num_procedures              9 289  0.8442907  1.2388294      0  0.6480687
## num_medications            10 289 14.5397924  7.3238083     14 14.0858369
## number_outpatient          11 289  0.1591696  0.6736050      0  0.0000000
## number_emergency           12 289  0.1730104  0.9267429      0  0.0000000
## number_inpatient           13 289  0.6678201  1.3018559      0  0.3690987
## diag_1*                    14 289 43.4359862 24.1849834     39 43.0386266
## diag_2*                    15 289 45.3183391 28.9008550     46 44.1072961
## diag_3*                    16 289 42.2249135 28.5040434     38 40.4291845
## number_diagnoses           17 289  5.9584775  1.5224041      6  5.9055794
## max_glu_serum*             18 289  2.0934256  0.7511045      2  2.1158798
## A1Cresult*                 19 289  1.9723183  0.6394946      2  1.9656652
## insulin*                   20 289  2.2110727  0.6349884      2  2.1673820
## change*                    21 289  1.7093426  0.4548534      2  1.7596567
## diabetesMed*               22 289  1.5916955  0.4923726      2  1.6137339
## readmitted*                23 289  1.5986159  0.4910287      2  1.6223176
##                              mad min max range        skew    kurtosis
## race*                     0.0000   1   5     4 -0.45149674  0.12878562
## gender*                   0.0000   1   2     1  0.32793857 -1.89899132
## age*                      1.4826   1   8     7 -0.45607934 -0.18448163
## admission_type_id         0.0000   1   6     5 -1.51625238  0.33020332
## discharge_disposition_id  0.0000   1  13    12  2.08689810  4.41083501
## admission_source_id       0.0000   1   7     6 -2.93763596  6.67378573
## time_in_hospital          2.9652   1  14    13  0.79335910  0.09585215
## num_lab_procedures       14.8260  31 106    75  0.33938332 -0.22361907
## num_procedures            0.0000   0   6     6  1.30048520  0.79265021
## num_medications           7.4130   1  35    34  0.57284441  0.02853114
## number_outpatient         0.0000   0   6     6  5.50694384 34.51658080
## number_emergency          0.0000   0   9     9  6.77369511 49.50648338
## number_inpatient          0.0000   0   9     9  2.84011121 10.32269040
## diag_1*                  28.1694   1  92    91  0.13025342 -0.97016096
## diag_2*                  34.0998   1 104   103  0.20413932 -1.14550741
## diag_3*                  31.1346   1 104   103  0.43363785 -0.96011570
## number_diagnoses          1.4826   3   9     6  0.54638627  0.32225443
## max_glu_serum*            1.4826   1   3     2 -0.15330834 -1.22262668
## A1Cresult*                0.0000   1   3     2  0.02364437 -0.56572540
## insulin*                  0.0000   1   4     3  0.92913449  1.41287848
## change*                   0.0000   1   2     1 -0.91729867 -1.16254993
## diabetesMed*              0.0000   1   2     1 -0.37117505 -1.86865915
## readmitted*               0.0000   1   2     1 -0.40028082 -1.84612737
##                                  se
## race*                    0.05943242
## gender*                  0.02907055
## age*                     0.09380618
## admission_type_id        0.11531754
## discharge_disposition_id 0.12696730
## admission_source_id      0.09817439
## time_in_hospital         0.17928176
## num_lab_procedures       0.85201352
## num_procedures           0.07287232
## num_medications          0.43081225
## number_outpatient        0.03962383
## number_emergency         0.05451429
## number_inpatient         0.07657976
## diag_1*                  1.42264608
## diag_2*                  1.70005029
## diag_3*                  1.67670843
## number_diagnoses         0.08955318
## max_glu_serum*           0.04418262
## A1Cresult*               0.03761733
## insulin*                 0.03735226
## change*                  0.02675608
## diabetesMed*             0.02896310
## readmitted*              0.02888404
sapply(diabetic_largedata,function(x)sum(is.na(x)))
##                     race                   gender                      age 
##                        0                        0                        0 
##        admission_type_id discharge_disposition_id      admission_source_id 
##                        0                        0                        0 
##         time_in_hospital       num_lab_procedures           num_procedures 
##                        0                        0                        0 
##          num_medications        number_outpatient         number_emergency 
##                        0                        0                        0 
##         number_inpatient                   diag_1                   diag_2 
##                        0                        0                        0 
##                   diag_3         number_diagnoses            max_glu_serum 
##                        0                        0                        0 
##                A1Cresult                  insulin                   change 
##                        0                        0                        0 
##              diabetesMed               readmitted 
##                        0                        0
#gender age race
table(diabetic_largedata$race,diabetic_largedata$age)
##                  
##                   [10-20) [20-30) [30-40) [40-50) [50-60) [60-70) [70-80)
##   AfricanAmerican       1       2       4      10      14       8       8
##   Asian                 0       0       0       1       3       2       1
##   Caucasian             3       2       9      19      37      30      45
##   Hispanic              0       0       2       8      10      13       3
##   Other                 0       0       2       0       1       7       3
##                  
##                   [80-90)
##   AfricanAmerican       4
##   Asian                 0
##   Caucasian            35
##   Hispanic              2
##   Other                 0
ggplot(diabetic_largedata, aes(readmitted)) + 
           geom_bar(aes(age, fill = factor(readmitted)), position = 'stack')+
            scale_fill_manual(values = c("#bff5cc",  "#009f71"))+
            ylab("Readmission")+
            xlab("age")+
            ggtitle("Age Vs Readmission")

ggplot(diabetic_largedata, aes(readmitted)) + 
           geom_bar(aes(race, fill = factor(readmitted)), position = 'stack')+
            scale_fill_manual(values = c("#bea9de",  "#895ae8"))+
            ylab("Readmission")+
            xlab("Race")+
            ggtitle("Race Vs Readmission")

lapply(diabetic_largedata,function(x)table(x))
## $race
## x
## AfricanAmerican           Asian       Caucasian        Hispanic           Other 
##              51               7             180              38              13 
## 
## $gender
## x
## Female   Male 
##    168    121 
## 
## $age
## x
## [10-20) [20-30) [30-40) [40-50) [50-60) [60-70) [70-80) [80-90) 
##       4       4      17      38      65      60      60      41 
## 
## $admission_type_id
## x
##   1   2   3   6 
##  52   4   2 231 
## 
## $discharge_disposition_id
## x
##   1   2   3   5   6   7  10  11  13 
## 190  20  33   5  27   9   1   3   1 
## 
## $admission_source_id
## x
##   1   2   7 
##  23   2 264 
## 
## $time_in_hospital
## x
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 
## 16 35 33 51 34 34 22 15 15 14  5  7  3  5 
## 
## $num_lab_procedures
## x
##  31  32  36  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   1   1   1   2   2   2   4   2   6   2   4   5   2   4   9   5   7   7   6   8 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74 
##   6   8   8  10   5   7   7   6   9   6   9   5   8   8   7   6   6   9   4   5 
##  75  76  77  78  79  80  81  82  83  84  85  86  87  88  90  91  93  94  95  96 
##   6   5   8   4   6   5   4   2   2   4   2   1   2   3   1   2   2   1   2   2 
##  97  98 102 105 106 
##   1   2   1   1   1 
## 
## $num_procedures
## x
##   0   1   2   3   4   5   6 
## 175  42  23  44   2   2   1 
## 
## $num_medications
## x
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  2  3 10  5  8 14  7 11 15  9 24 14 21 16 10 21 13  4 19 10  8  5  4  3  6  7 
## 27 28 29 30 31 32 33 34 35 
##  3  4  2  1  1  2  1  4  2 
## 
## $number_outpatient
## x
##   0   1   2   3   4   5   6 
## 266  13   3   4   1   1   1 
## 
## $number_emergency
## x
##   0   1   2   3   5   6   7   9 
## 272   8   3   1   1   2   1   1 
## 
## $number_inpatient
## x
##   0   1   2   3   4   5   6   7   9 
## 197  42  25  15   4   1   2   2   1 
## 
## $diag_1
## x
##    112    162    188    250 250.02 250.03  250.1 250.11 250.12 250.13 250.22 
##      1      2      1      1     11      3      2      2      4      7      2 
##  250.6  250.7  250.8 250.81 250.82 250.83    253    276    280    295    296 
##      6      1      7      2      3      1      1      2      1      1      2 
##    298    332    340    376     38    401    402    403    410    411    414 
##      1      2      1      1      8      4      1      1      5      1     14 
##    415    427    428    433    434    435    436    437    443    444    451 
##      1      4     19      2      9      8      1      1      2      1      1 
##    453    458    486    491    493    507    515    518     53    531    535 
##      3      1      9     21      9      1      1      2      1      2      2 
##    537    542    558    560    562    564    566    569    571    574    577 
##      1      0      3      1      3      1      0      2      1      2      4 
##    578    584    590    596    599    681    682    707    714    715    722 
##      2      2      4      1      4      1     19      2      1      2      2 
##    730    733    780    784    785    786     79      8    807    820    965 
##      1      1      1      1      1     12      1      1      2      1      1 
##    969    996    V57    V58 
##      1      2      1      1 
## 
## $diag_2
## x
##    162    174    197    211    218    244    250 250.01 250.02 250.03 250.11 
##      1      1      1      1      1      1     22      7     19      3      1 
## 250.12 250.13  250.4 250.42 250.43  250.6  250.7  250.8 250.82 250.83    272 
##      1      1      2      1      1      5      1      1      3      2      3 
##    276    278    280    285    286    294    295    300    305    327    331 
##     17      1      1      2      2      5      1      1      1      2      1 
##    337    348    349    357    358     38    382    386    401    402    403 
##      1      1      1      2      1      3      1      1      9      2      5 
##     41    410    411    413    414    415    424    425    427    428    433 
##      4      2     11      2      7      1      1      4      4     10      2 
##    435    437    441    443    455    466    473    482    486    491    493 
##      1      1      1      2      1      1      1      2      3      7      2 
##    496    511    515    518    530    532    535    536    569    571    577 
##     10      2      2      2      1      1      2      1      1      1      1 
##    584    585    590    599    607    682    707    716    724    730    758 
##      2      1      1     10      1      3      9      1      1      1      1 
##    780    781    785    786    787    789    790    792    799      8    995 
##      3      2      1      2      1      1      1      1      1      1      6 
##    998   E888   E906   E980    V58 
##      0      1      1      1      1 
## 
## $diag_3
## x
##    198    208    211    216    238    250 250.01 250.02 250.03 250.12  250.4 
##      1      1      1      1      1     24      3     15      6      1      1 
## 250.41 250.42 250.43  250.5 250.53  250.6  250.8    272    275    276    278 
##      1      1      1      1      1      4      3     11      1     16      4 
##    280    285    287    288    293    294    295    296    300    303    327 
##      4      4      1      3      1      2      2      1      1      2      2 
##    332    345    357    381    401    402    403     41    413    414    416 
##      1      1      3      1     20      1      3      4      1     13      1 
##    424    425    426    427    428    433    435    443    446    453    458 
##      2      5      1      5      9      2      1      2      0      1      1 
##    466    486    493    496      5    511    518    530    535    536    564 
##      1      2      2      3      1      1      4      2      3      1      1 
##    571    572    574    575    581    583    584    585    592    593    599 
##      1      1      1      1      2      1      2      3      1      1      7 
##    625    681    682    707    724    733    737    780    783    785    786 
##      1      1      5      5      1      2      1      4      1      4      1 
##    790    794    799    826    891    920    945    962    995   E849   E880 
##      1      2      2      1      1      1      1      1      3      1      1 
##   E885   E932   E950    V12    V58 
##      1      1      1      1      2 
## 
## $number_diagnoses
## x
##   3   4   5   6   7   8   9 
##  18  13  67 142   3   6  40 
## 
## $max_glu_serum
## x
## >200 >300 Norm 
##   69  124   96 
## 
## $A1Cresult
## x
##   >7   >8 Norm 
##   63  171   55 
## 
## $insulin
## x
##   Down     No Steady     Up 
##     20    202     53     14 
## 
## $change
## x
##  Ch  No 
##  84 205 
## 
## $diabetesMed
## x
##  No Yes 
## 118 171 
## 
## $readmitted
## x
##  NO YES 
## 116 173

###Data Visualization

#when you have categorical and continues column you can come up with box plot
library(ggplot2)
ggplot(diabetic_largedata, aes_string(x="readmitted",y=diabetic_largedata$num_lab_procedures)) + geom_boxplot(aes(fill=readmitted))+ggtitle("num_lab_procedures grouped by readmitted")

library(ggbeeswarm)
ggplot(diabetic_largedata, aes_string(x="readmitted",y=diabetic_largedata$num_lab_procedures)) + geom_boxplot(aes(fill=readmitted,color=readmitted)) + geom_quasirandom(alpha = 0.3)

#Ploting boxplot for all continues variable

continuous_vars <- names(diabetic_largedata)[sapply(diabetic_largedata, is.numeric)]
plots <- list()
for (var in continuous_vars) {
  p<-ggplot(diabetic_largedata, aes(x = factor(readmitted), y = .data[[var]], fill = factor(readmitted))) +
    geom_boxplot() +
    labs(title = paste("Boxplot of", var)) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5))
  plots[[var]] <- p
}

print(plots)
## $admission_type_id

## 
## $discharge_disposition_id

## 
## $admission_source_id

## 
## $time_in_hospital

## 
## $num_lab_procedures

## 
## $num_procedures

## 
## $num_medications

## 
## $number_outpatient

## 
## $number_emergency

## 
## $number_inpatient

## 
## $number_diagnoses

continuous_vars <- names(diabetic_largedata)[sapply(diabetic_largedata, is.numeric)]

# create a long format dataset with continuous_vars and readmitted
df <- reshape2::melt(diabetic_largedata[, c(continuous_vars, "readmitted")], id.vars = "readmitted")

# create boxplots with outcome variable as fill color and facet by continuous variable
p <- ggplot(df, aes(x = factor(readmitted), y = value, fill = factor(readmitted))) +
  geom_boxplot() +
  facet_wrap(~variable, scales = "free_y") +
  theme_bw()

print(p)

vars <- names(diabetic_largedata)
plots <- list()

for (var in vars) {
  if(is.numeric(diabetic_largedata[[var]])) {
q1<-quantile(diabetic_largedata[[var]], 0.25)
q3<-quantile(diabetic_largedata[[var]], 0.75)
p <- ggplot(diabetic_largedata, aes(x = .data[[var]])) +
      geom_histogram(binwidth = 0.3, col = "black", fill = "#8db700") +
      geom_vline(xintercept = q1, col = "red", lwd = 2) +
      geom_vline(xintercept = q3, col = "red", lwd = 2) +
      labs(title = paste("Histogram of", var)) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5))
  
  plots[[var]]<-p
  }else if (is.factor(diabetic_largedata[[var]])) {
    p<- ggplot(diabetic_largedata, aes(x = .data[[var]], fill = readmitted)) +
      geom_bar(position = "dodge") +
      labs(title = paste("Barplot of", var)) +
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5))
    
    plots[[var]] <- p
  }
}


print(plots)
## $race

## 
## $gender

## 
## $age

## 
## $admission_type_id

## 
## $discharge_disposition_id

## 
## $admission_source_id

## 
## $time_in_hospital

## 
## $num_lab_procedures

## 
## $num_procedures

## 
## $num_medications

## 
## $number_outpatient

## 
## $number_emergency

## 
## $number_inpatient

## 
## $diag_1

## 
## $diag_2

## 
## $diag_3

## 
## $number_diagnoses

## 
## $max_glu_serum

## 
## $A1Cresult

## 
## $insulin

## 
## $change

## 
## $diabetesMed

vars <- names(diabetic_largedata)
plots <- list()

for (i in 1:(length(vars)-1)) {
  for (j in (i+1):length(vars)) {
    if(is.numeric(diabetic_largedata[[vars[i]]]) && is.numeric(diabetic_largedata[[vars[j]]])) {
      p <- ggplot(diabetic_largedata, aes(x = .data[[vars[i]]], y = .data[[vars[j]]], color = factor(readmitted))) +
        geom_point() +
        labs(title = paste("Scatterplot of", vars[i], "vs.", vars[j])) +
        theme_bw() +
        theme(plot.title = element_text(hjust = 0.5))
      
      plots[[paste(vars[i], vars[j], sep="_")]] <- p
    }
  }
}


print(plots)
## $admission_type_id_discharge_disposition_id

## 
## $admission_type_id_admission_source_id

## 
## $admission_type_id_time_in_hospital

## 
## $admission_type_id_num_lab_procedures

## 
## $admission_type_id_num_procedures

## 
## $admission_type_id_num_medications

## 
## $admission_type_id_number_outpatient

## 
## $admission_type_id_number_emergency

## 
## $admission_type_id_number_inpatient

## 
## $admission_type_id_number_diagnoses

## 
## $discharge_disposition_id_admission_source_id

## 
## $discharge_disposition_id_time_in_hospital

## 
## $discharge_disposition_id_num_lab_procedures

## 
## $discharge_disposition_id_num_procedures

## 
## $discharge_disposition_id_num_medications

## 
## $discharge_disposition_id_number_outpatient

## 
## $discharge_disposition_id_number_emergency

## 
## $discharge_disposition_id_number_inpatient

## 
## $discharge_disposition_id_number_diagnoses

## 
## $admission_source_id_time_in_hospital

## 
## $admission_source_id_num_lab_procedures

## 
## $admission_source_id_num_procedures

## 
## $admission_source_id_num_medications

## 
## $admission_source_id_number_outpatient

## 
## $admission_source_id_number_emergency

## 
## $admission_source_id_number_inpatient

## 
## $admission_source_id_number_diagnoses

## 
## $time_in_hospital_num_lab_procedures

## 
## $time_in_hospital_num_procedures

## 
## $time_in_hospital_num_medications

## 
## $time_in_hospital_number_outpatient

## 
## $time_in_hospital_number_emergency

## 
## $time_in_hospital_number_inpatient

## 
## $time_in_hospital_number_diagnoses

## 
## $num_lab_procedures_num_procedures

## 
## $num_lab_procedures_num_medications

## 
## $num_lab_procedures_number_outpatient

## 
## $num_lab_procedures_number_emergency

## 
## $num_lab_procedures_number_inpatient

## 
## $num_lab_procedures_number_diagnoses

## 
## $num_procedures_num_medications

## 
## $num_procedures_number_outpatient

## 
## $num_procedures_number_emergency

## 
## $num_procedures_number_inpatient

## 
## $num_procedures_number_diagnoses

## 
## $num_medications_number_outpatient

## 
## $num_medications_number_emergency

## 
## $num_medications_number_inpatient

## 
## $num_medications_number_diagnoses

## 
## $number_outpatient_number_emergency

## 
## $number_outpatient_number_inpatient

## 
## $number_outpatient_number_diagnoses

## 
## $number_emergency_number_inpatient

## 
## $number_emergency_number_diagnoses

## 
## $number_inpatient_number_diagnoses

#Diag bocketting :

str(diabetic_largedata)
## 'data.frame':    289 obs. of  23 variables:
##  $ race                    : Factor w/ 5 levels "AfricanAmerican",..: 3 1 3 5 3 3 4 3 3 3 ...
##  $ gender                  : Factor w/ 2 levels "Female","Male": 2 1 1 2 1 2 1 1 1 2 ...
##  $ age                     : Factor w/ 8 levels "[10-20)","[20-30)",..: 8 7 5 7 3 8 5 4 5 5 ...
##  $ admission_type_id       : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ discharge_disposition_id: int  3 1 1 6 1 1 1 1 1 10 ...
##  $ admission_source_id     : int  7 7 7 7 2 7 7 7 7 1 ...
##  $ time_in_hospital        : int  5 10 2 11 14 7 2 3 2 4 ...
##  $ num_lab_procedures      : int  47 72 61 71 43 105 66 76 43 41 ...
##  $ num_procedures          : int  1 1 0 1 0 3 0 0 0 1 ...
##  $ num_medications         : int  6 19 5 20 11 16 3 9 13 8 ...
##  $ number_outpatient       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_emergency        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_1                  : Factor w/ 92 levels "112","162","188",..: 24 5 19 87 22 36 6 21 73 76 ...
##  $ diag_2                  : Factor w/ 104 levels "162","174","197",..: 28 23 89 9 7 9 42 23 10 66 ...
##  $ diag_3                  : Factor w/ 104 levels "198","208","211",..: 46 28 9 100 66 21 19 31 93 6 ...
##  $ number_diagnoses        : int  5 5 5 5 3 5 3 5 5 3 ...
##  $ max_glu_serum           : Factor w/ 3 levels ">200",">300",..: 1 2 2 1 3 2 3 2 2 1 ...
##  $ A1Cresult               : Factor w/ 3 levels ">7",">8","Norm": 3 2 2 1 1 1 1 1 1 2 ...
##  $ insulin                 : Factor w/ 4 levels "Down","No","Steady",..: 2 4 3 2 2 2 2 2 2 2 ...
##  $ change                  : Factor w/ 2 levels "Ch","No": 2 1 2 2 2 2 2 1 2 2 ...
##  $ diabetesMed             : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 1 2 ...
##  $ readmitted              : chr  "YES" "YES" "NO" "NO" ...
# If any of the diagnosis codes falls in the range of 390-459 or is equal to 785, the row is assigned to the "diag_circ" category. If the code falls in the range of 460-519 or is equal to 786, the row is assigned to the "diag_resp" category. If the code falls in the range of 520-579 or is equal to 787, the row is assigned to the "diag_dig" category. If the code falls  is greater than 251 but less than 249, the row is assigned to the "diag_diab" category. If the code falls in the range of 290-319 or is equal to 780 or 781, the row is assigned to the "diag_ment" category. If the code falls in the range of 800-999, the row is assigned to the "diag_inj" category. If the code falls in the range of 710-739 or is equal to 736, the row is assigned to the "diag_musc" category. If the code falls in the range of 580-629 or is equal to 788, the row is assigned to the "diag_geni" category. If the code falls in the range of 140-239, the row is assigned to the "diag_neop" category. If the code doesn't fall into any of these ranges, the row is assigned to the "diag_other" category.

diabetic_largedata$diag_circ <- 0
diabetic_largedata$diag_resp <- 0
diabetic_largedata$diag_dig <- 0
diabetic_largedata$diag_diab <- 0
diabetic_largedata$diag_inj <- 0
diabetic_largedata$diag_musc <-0
diabetic_largedata$diag_geni <- 0
diabetic_largedata$diag_neop <-0
diabetic_largedata$diag_other <- 0

# Loop through each row of the data frame
for (i in 1:nrow(diabetic_largedata)) {

  # Check each diagnosis code in the row and assign to the appropriate category
  for (j in 1:3) {
    code <- as.character(diabetic_largedata[i, paste0("diag_", j)])
    
    if (code >= "390" & code <= "459" | code == "785") {
      diabetic_largedata[i, "diag_circ"] <- 1
    } else if (code > "249" & code < "251") {
      diabetic_largedata[i, "diag_diab"] <- 1
    } else if (code >= "460" & code <= "519" | code == "786") {
      diabetic_largedata[i, "diag_resp"] <- 1
    } else if (code >= "520" & code <= "579" | code == "787") {
      diabetic_largedata[i, "diag_dig"] <- 1
    } else if (code >= "800" & code <= "999") {
      diabetic_largedata[i, "diag_inj"] <- 1
    } else if (code >= "710" & code <= "739") {
      diabetic_largedata[i, "diag_musc"] <- 1
    } else if (code >= "580" & code <= "629" | code == "788") {
      diabetic_largedata[i, "diag_geni"] <- 1
    } else if (code >= "140" & code <= "239" ){
       diabetic_largedata[i, "diag_neop"] <- 1
    } else (diabetic_largedata$diag_other <- as.numeric(apply(diabetic_largedata[, paste0("diag_", 1:3)], 1, function(x) any(x %in% c("780", "781", "784", paste0(790:799), paste0(240:249), paste0(251:279), paste0(680:709), "782", paste0(1:139), paste0(290:319), paste0(280:289), paste0(320:359), paste0(630:679), paste0(360:389), paste0(740:759), paste0("E", 0:9), paste0("V", 0:9))))))
  }
}
str(diabetic_largedata)
## 'data.frame':    289 obs. of  32 variables:
##  $ race                    : Factor w/ 5 levels "AfricanAmerican",..: 3 1 3 5 3 3 4 3 3 3 ...
##  $ gender                  : Factor w/ 2 levels "Female","Male": 2 1 1 2 1 2 1 1 1 2 ...
##  $ age                     : Factor w/ 8 levels "[10-20)","[20-30)",..: 8 7 5 7 3 8 5 4 5 5 ...
##  $ admission_type_id       : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ discharge_disposition_id: int  3 1 1 6 1 1 1 1 1 10 ...
##  $ admission_source_id     : int  7 7 7 7 2 7 7 7 7 1 ...
##  $ time_in_hospital        : int  5 10 2 11 14 7 2 3 2 4 ...
##  $ num_lab_procedures      : int  47 72 61 71 43 105 66 76 43 41 ...
##  $ num_procedures          : int  1 1 0 1 0 3 0 0 0 1 ...
##  $ num_medications         : int  6 19 5 20 11 16 3 9 13 8 ...
##  $ number_outpatient       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_emergency        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_1                  : Factor w/ 92 levels "112","162","188",..: 24 5 19 87 22 36 6 21 73 76 ...
##  $ diag_2                  : Factor w/ 104 levels "162","174","197",..: 28 23 89 9 7 9 42 23 10 66 ...
##  $ diag_3                  : Factor w/ 104 levels "198","208","211",..: 46 28 9 100 66 21 19 31 93 6 ...
##  $ number_diagnoses        : int  5 5 5 5 3 5 3 5 5 3 ...
##  $ max_glu_serum           : Factor w/ 3 levels ">200",">300",..: 1 2 2 1 3 2 3 2 2 1 ...
##  $ A1Cresult               : Factor w/ 3 levels ">7",">8","Norm": 3 2 2 1 1 1 1 1 1 2 ...
##  $ insulin                 : Factor w/ 4 levels "Down","No","Steady",..: 2 4 3 2 2 2 2 2 2 2 ...
##  $ change                  : Factor w/ 2 levels "Ch","No": 2 1 2 2 2 2 2 1 2 2 ...
##  $ diabetesMed             : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 1 2 ...
##  $ readmitted              : chr  "YES" "YES" "NO" "NO" ...
##  $ diag_circ               : num  1 0 0 0 0 1 1 0 0 0 ...
##  $ diag_resp               : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ diag_dig                : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ diag_diab               : num  0 1 1 1 1 1 1 0 1 1 ...
##  $ diag_inj                : num  0 0 0 1 0 0 0 0 1 0 ...
##  $ diag_musc               : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ diag_geni               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_neop               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_other              : num  1 1 1 0 1 1 1 1 1 0 ...
library(ggplot2)
diag_subset <- diabetic_largedata[, 23:32]
diag_long <- reshape2::melt(diag_subset)
## Using readmitted as id variables
ggplot(diag_long, aes(x = variable, y = value, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  stat_summary(aes(label = ..y..), fun = sum, geom = "text", position = position_dodge(width = 0.9), vjust = -0.5) +
  labs(x = "Diagnosis", y = "Frequency") +
  theme_minimal() +
  theme(legend.position = "none")

#Ploting categorical variable #I haven’t add the new diags and remove the diag1,2,3 from the data set I’ll add later

categorical_vars <- names(diabetic_largedata)[sapply(diabetic_largedata, is.factor)]
plots <- list()

for (var in categorical_vars) {
  p <- ggplot(diabetic_largedata, aes(x = factor(readmitted), fill = .data[[var]])) +
    geom_bar(position = "fill") +
    labs(title = paste("Barplot of", var)) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5))
  
  plots[[var]] <- p
}

plots
## $race

## 
## $gender

## 
## $age

## 
## $diag_1

## 
## $diag_2

## 
## $diag_3

## 
## $max_glu_serum

## 
## $A1Cresult

## 
## $insulin

## 
## $change

## 
## $diabetesMed

#Dummy

library(caret)
## Loading required package: lattice
diabetic_largedata$discharge_disposition_id <- as.factor(diabetic_largedata$discharge_disposition_id)
diabetic_largedata$admission_type_id <- as.factor(diabetic_largedata$admission_type_id)
diabetic_largedata$admission_source_id <- as.factor(diabetic_largedata$admission_source_id)
dummy1 <- predict(dummyVars(~race+gender+age+discharge_disposition_id + admission_type_id + max_glu_serum + A1Cresult + admission_source_id+insulin+change+diabetesMed, data = diabetic_largedata), newdata = diabetic_largedata)
head(dummy1)
##     race.AfricanAmerican race.Asian race.Caucasian race.Hispanic race.Other
## 163                    0          0              1             0          0
## 461                    1          0              0             0          0
## 594                    0          0              1             0          0
## 697                    0          0              0             0          1
## 772                    0          0              1             0          0
## 824                    0          0              1             0          0
##     gender.Female gender.Male age.[10-20) age.[20-30) age.[30-40) age.[40-50)
## 163             0           1           0           0           0           0
## 461             1           0           0           0           0           0
## 594             1           0           0           0           0           0
## 697             0           1           0           0           0           0
## 772             1           0           0           0           1           0
## 824             0           1           0           0           0           0
##     age.[50-60) age.[60-70) age.[70-80) age.[80-90) discharge_disposition_id.1
## 163           0           0           0           1                          0
## 461           0           0           1           0                          1
## 594           1           0           0           0                          1
## 697           0           0           1           0                          0
## 772           0           0           0           0                          1
## 824           0           0           0           1                          1
##     discharge_disposition_id.2 discharge_disposition_id.3
## 163                          0                          1
## 461                          0                          0
## 594                          0                          0
## 697                          0                          0
## 772                          0                          0
## 824                          0                          0
##     discharge_disposition_id.5 discharge_disposition_id.6
## 163                          0                          0
## 461                          0                          0
## 594                          0                          0
## 697                          0                          1
## 772                          0                          0
## 824                          0                          0
##     discharge_disposition_id.7 discharge_disposition_id.10
## 163                          0                           0
## 461                          0                           0
## 594                          0                           0
## 697                          0                           0
## 772                          0                           0
## 824                          0                           0
##     discharge_disposition_id.11 discharge_disposition_id.13 admission_type_id.1
## 163                           0                           0                   0
## 461                           0                           0                   0
## 594                           0                           0                   0
## 697                           0                           0                   0
## 772                           0                           0                   0
## 824                           0                           0                   0
##     admission_type_id.2 admission_type_id.3 admission_type_id.6
## 163                   0                   0                   1
## 461                   0                   0                   1
## 594                   0                   0                   1
## 697                   0                   0                   1
## 772                   0                   0                   1
## 824                   0                   0                   1
##     max_glu_serum.>200 max_glu_serum.>300 max_glu_serum.Norm A1Cresult.>7
## 163                  1                  0                  0            0
## 461                  0                  1                  0            0
## 594                  0                  1                  0            0
## 697                  1                  0                  0            1
## 772                  0                  0                  1            1
## 824                  0                  1                  0            1
##     A1Cresult.>8 A1Cresult.Norm admission_source_id.1 admission_source_id.2
## 163            0              1                     0                     0
## 461            1              0                     0                     0
## 594            1              0                     0                     0
## 697            0              0                     0                     0
## 772            0              0                     0                     1
## 824            0              0                     0                     0
##     admission_source_id.7 insulin.Down insulin.No insulin.Steady insulin.Up
## 163                     1            0          1              0          0
## 461                     1            0          0              0          1
## 594                     1            0          0              1          0
## 697                     1            0          1              0          0
## 772                     0            0          1              0          0
## 824                     1            0          1              0          0
##     change.Ch change.No diabetesMed.No diabetesMed.Yes
## 163         0         1              1               0
## 461         1         0              0               1
## 594         0         1              0               1
## 697         0         1              0               1
## 772         0         1              1               0
## 824         0         1              0               1
colnames(dummy1)
##  [1] "race.AfricanAmerican"        "race.Asian"                 
##  [3] "race.Caucasian"              "race.Hispanic"              
##  [5] "race.Other"                  "gender.Female"              
##  [7] "gender.Male"                 "age.[10-20)"                
##  [9] "age.[20-30)"                 "age.[30-40)"                
## [11] "age.[40-50)"                 "age.[50-60)"                
## [13] "age.[60-70)"                 "age.[70-80)"                
## [15] "age.[80-90)"                 "discharge_disposition_id.1" 
## [17] "discharge_disposition_id.2"  "discharge_disposition_id.3" 
## [19] "discharge_disposition_id.5"  "discharge_disposition_id.6" 
## [21] "discharge_disposition_id.7"  "discharge_disposition_id.10"
## [23] "discharge_disposition_id.11" "discharge_disposition_id.13"
## [25] "admission_type_id.1"         "admission_type_id.2"        
## [27] "admission_type_id.3"         "admission_type_id.6"        
## [29] "max_glu_serum.>200"          "max_glu_serum.>300"         
## [31] "max_glu_serum.Norm"          "A1Cresult.>7"               
## [33] "A1Cresult.>8"                "A1Cresult.Norm"             
## [35] "admission_source_id.1"       "admission_source_id.2"      
## [37] "admission_source_id.7"       "insulin.Down"               
## [39] "insulin.No"                  "insulin.Steady"             
## [41] "insulin.Up"                  "change.Ch"                  
## [43] "change.No"                   "diabetesMed.No"             
## [45] "diabetesMed.Yes"
colnames(dummy1)[colnames(dummy1) == "A1Cresult.>7"] <- "A1Cresult7"
colnames(dummy1)[colnames(dummy1) == "A1Cresult.>8"] <- "A1Cresult8" 
colnames(dummy1)[colnames(dummy1) == "max_glu_serum.>300"] <- "max_glu_serum300" 
colnames(dummy1)[colnames(dummy1) == "max_glu_serum.>200"] <- "max_glu_serum200"
colnames(dummy1)[colnames(dummy1) == "age.[10-20)"] <- "agefirst"
colnames(dummy1)[colnames(dummy1) == "age.[20-30)"] <- "agesecond"
colnames(dummy1)[colnames(dummy1) == "age.[30-40)"] <- "agethird"
colnames(dummy1)[colnames(dummy1) == "age.[40-50)"] <- "ageforth"
colnames(dummy1)[colnames(dummy1) == "age.[50-60)"] <- "agefifth"
colnames(dummy1)[colnames(dummy1) == "age.[60-70)"] <- "agesixth"
colnames(dummy1)[colnames(dummy1) == "age.[70-80)"] <- "ageseventh"
colnames(dummy1)[colnames(dummy1) == "age.[80-90)"] <- "ageeighth"
colnames(dummy1)
##  [1] "race.AfricanAmerican"        "race.Asian"                 
##  [3] "race.Caucasian"              "race.Hispanic"              
##  [5] "race.Other"                  "gender.Female"              
##  [7] "gender.Male"                 "agefirst"                   
##  [9] "agesecond"                   "agethird"                   
## [11] "ageforth"                    "agefifth"                   
## [13] "agesixth"                    "ageseventh"                 
## [15] "ageeighth"                   "discharge_disposition_id.1" 
## [17] "discharge_disposition_id.2"  "discharge_disposition_id.3" 
## [19] "discharge_disposition_id.5"  "discharge_disposition_id.6" 
## [21] "discharge_disposition_id.7"  "discharge_disposition_id.10"
## [23] "discharge_disposition_id.11" "discharge_disposition_id.13"
## [25] "admission_type_id.1"         "admission_type_id.2"        
## [27] "admission_type_id.3"         "admission_type_id.6"        
## [29] "max_glu_serum200"            "max_glu_serum300"           
## [31] "max_glu_serum.Norm"          "A1Cresult7"                 
## [33] "A1Cresult8"                  "A1Cresult.Norm"             
## [35] "admission_source_id.1"       "admission_source_id.2"      
## [37] "admission_source_id.7"       "insulin.Down"               
## [39] "insulin.No"                  "insulin.Steady"             
## [41] "insulin.Up"                  "change.Ch"                  
## [43] "change.No"                   "diabetesMed.No"             
## [45] "diabetesMed.Yes"
cbind(colnames(diabetic_largedata))
##       [,1]                      
##  [1,] "race"                    
##  [2,] "gender"                  
##  [3,] "age"                     
##  [4,] "admission_type_id"       
##  [5,] "discharge_disposition_id"
##  [6,] "admission_source_id"     
##  [7,] "time_in_hospital"        
##  [8,] "num_lab_procedures"      
##  [9,] "num_procedures"          
## [10,] "num_medications"         
## [11,] "number_outpatient"       
## [12,] "number_emergency"        
## [13,] "number_inpatient"        
## [14,] "diag_1"                  
## [15,] "diag_2"                  
## [16,] "diag_3"                  
## [17,] "number_diagnoses"        
## [18,] "max_glu_serum"           
## [19,] "A1Cresult"               
## [20,] "insulin"                 
## [21,] "change"                  
## [22,] "diabetesMed"             
## [23,] "readmitted"              
## [24,] "diag_circ"               
## [25,] "diag_resp"               
## [26,] "diag_dig"                
## [27,] "diag_diab"               
## [28,] "diag_inj"                
## [29,] "diag_musc"               
## [30,] "diag_geni"               
## [31,] "diag_neop"               
## [32,] "diag_other"
diabetic_largedata<-cbind(diabetic_largedata[,-c(1:6 ,14:16 ,18:22 )],dummy1)
head(diabetic_largedata)
##     time_in_hospital num_lab_procedures num_procedures num_medications
## 163                5                 47              1               6
## 461               10                 72              1              19
## 594                2                 61              0               5
## 697               11                 71              1              20
## 772               14                 43              0              11
## 824                7                105              3              16
##     number_outpatient number_emergency number_inpatient number_diagnoses
## 163                 0                0                0                5
## 461                 0                0                0                5
## 594                 0                0                0                5
## 697                 0                0                0                5
## 772                 0                0                0                3
## 824                 0                0                0                5
##     readmitted diag_circ diag_resp diag_dig diag_diab diag_inj diag_musc
## 163        YES         1         0        0         0        0         0
## 461        YES         0         0        0         1        0         0
## 594         NO         0         0        0         1        0         0
## 697         NO         0         0        0         1        1         0
## 772        YES         0         0        1         1        0         0
## 824        YES         1         0        0         1        0         0
##     diag_geni diag_neop diag_other race.AfricanAmerican race.Asian
## 163         0         0          1                    0          0
## 461         0         0          1                    1          0
## 594         0         0          1                    0          0
## 697         0         0          0                    0          0
## 772         0         0          1                    0          0
## 824         0         0          1                    0          0
##     race.Caucasian race.Hispanic race.Other gender.Female gender.Male agefirst
## 163              1             0          0             0           1        0
## 461              0             0          0             1           0        0
## 594              1             0          0             1           0        0
## 697              0             0          1             0           1        0
## 772              1             0          0             1           0        0
## 824              1             0          0             0           1        0
##     agesecond agethird ageforth agefifth agesixth ageseventh ageeighth
## 163         0        0        0        0        0          0         1
## 461         0        0        0        0        0          1         0
## 594         0        0        0        1        0          0         0
## 697         0        0        0        0        0          1         0
## 772         0        1        0        0        0          0         0
## 824         0        0        0        0        0          0         1
##     discharge_disposition_id.1 discharge_disposition_id.2
## 163                          0                          0
## 461                          1                          0
## 594                          1                          0
## 697                          0                          0
## 772                          1                          0
## 824                          1                          0
##     discharge_disposition_id.3 discharge_disposition_id.5
## 163                          1                          0
## 461                          0                          0
## 594                          0                          0
## 697                          0                          0
## 772                          0                          0
## 824                          0                          0
##     discharge_disposition_id.6 discharge_disposition_id.7
## 163                          0                          0
## 461                          0                          0
## 594                          0                          0
## 697                          1                          0
## 772                          0                          0
## 824                          0                          0
##     discharge_disposition_id.10 discharge_disposition_id.11
## 163                           0                           0
## 461                           0                           0
## 594                           0                           0
## 697                           0                           0
## 772                           0                           0
## 824                           0                           0
##     discharge_disposition_id.13 admission_type_id.1 admission_type_id.2
## 163                           0                   0                   0
## 461                           0                   0                   0
## 594                           0                   0                   0
## 697                           0                   0                   0
## 772                           0                   0                   0
## 824                           0                   0                   0
##     admission_type_id.3 admission_type_id.6 max_glu_serum200 max_glu_serum300
## 163                   0                   1                1                0
## 461                   0                   1                0                1
## 594                   0                   1                0                1
## 697                   0                   1                1                0
## 772                   0                   1                0                0
## 824                   0                   1                0                1
##     max_glu_serum.Norm A1Cresult7 A1Cresult8 A1Cresult.Norm
## 163                  0          0          0              1
## 461                  0          0          1              0
## 594                  0          0          1              0
## 697                  0          1          0              0
## 772                  1          1          0              0
## 824                  0          1          0              0
##     admission_source_id.1 admission_source_id.2 admission_source_id.7
## 163                     0                     0                     1
## 461                     0                     0                     1
## 594                     0                     0                     1
## 697                     0                     0                     1
## 772                     0                     1                     0
## 824                     0                     0                     1
##     insulin.Down insulin.No insulin.Steady insulin.Up change.Ch change.No
## 163            0          1              0          0         0         1
## 461            0          0              0          1         1         0
## 594            0          0              1          0         0         1
## 697            0          1              0          0         0         1
## 772            0          1              0          0         0         1
## 824            0          1              0          0         0         1
##     diabetesMed.No diabetesMed.Yes
## 163              1               0
## 461              0               1
## 594              0               1
## 697              0               1
## 772              1               0
## 824              0               1
#replacing "." with empthy variable
colnames(diabetic_largedata) <- gsub("\\.", "", colnames(diabetic_largedata))
cbind(colnames(diabetic_largedata))
##       [,1]                        
##  [1,] "time_in_hospital"          
##  [2,] "num_lab_procedures"        
##  [3,] "num_procedures"            
##  [4,] "num_medications"           
##  [5,] "number_outpatient"         
##  [6,] "number_emergency"          
##  [7,] "number_inpatient"          
##  [8,] "number_diagnoses"          
##  [9,] "readmitted"                
## [10,] "diag_circ"                 
## [11,] "diag_resp"                 
## [12,] "diag_dig"                  
## [13,] "diag_diab"                 
## [14,] "diag_inj"                  
## [15,] "diag_musc"                 
## [16,] "diag_geni"                 
## [17,] "diag_neop"                 
## [18,] "diag_other"                
## [19,] "raceAfricanAmerican"       
## [20,] "raceAsian"                 
## [21,] "raceCaucasian"             
## [22,] "raceHispanic"              
## [23,] "raceOther"                 
## [24,] "genderFemale"              
## [25,] "genderMale"                
## [26,] "agefirst"                  
## [27,] "agesecond"                 
## [28,] "agethird"                  
## [29,] "ageforth"                  
## [30,] "agefifth"                  
## [31,] "agesixth"                  
## [32,] "ageseventh"                
## [33,] "ageeighth"                 
## [34,] "discharge_disposition_id1" 
## [35,] "discharge_disposition_id2" 
## [36,] "discharge_disposition_id3" 
## [37,] "discharge_disposition_id5" 
## [38,] "discharge_disposition_id6" 
## [39,] "discharge_disposition_id7" 
## [40,] "discharge_disposition_id10"
## [41,] "discharge_disposition_id11"
## [42,] "discharge_disposition_id13"
## [43,] "admission_type_id1"        
## [44,] "admission_type_id2"        
## [45,] "admission_type_id3"        
## [46,] "admission_type_id6"        
## [47,] "max_glu_serum200"          
## [48,] "max_glu_serum300"          
## [49,] "max_glu_serumNorm"         
## [50,] "A1Cresult7"                
## [51,] "A1Cresult8"                
## [52,] "A1CresultNorm"             
## [53,] "admission_source_id1"      
## [54,] "admission_source_id2"      
## [55,] "admission_source_id7"      
## [56,] "insulinDown"               
## [57,] "insulinNo"                 
## [58,] "insulinSteady"             
## [59,] "insulinUp"                 
## [60,] "changeCh"                  
## [61,] "changeNo"                  
## [62,] "diabetesMedNo"             
## [63,] "diabetesMedYes"
summary(diabetic_largedata)
##  time_in_hospital num_lab_procedures num_procedures   num_medications
##  Min.   : 1.000   Min.   : 31.0      Min.   :0.0000   Min.   : 1.00  
##  1st Qu.: 3.000   1st Qu.: 54.0      1st Qu.:0.0000   1st Qu.: 9.00  
##  Median : 5.000   Median : 63.0      Median :0.0000   Median :14.00  
##  Mean   : 5.398   Mean   : 64.2      Mean   :0.8443   Mean   :14.54  
##  3rd Qu.: 7.000   3rd Qu.: 74.0      3rd Qu.:1.0000   3rd Qu.:19.00  
##  Max.   :14.000   Max.   :106.0      Max.   :6.0000   Max.   :35.00  
##  number_outpatient number_emergency number_inpatient number_diagnoses
##  Min.   :0.0000    Min.   :0.000    Min.   :0.0000   Min.   :3.000   
##  1st Qu.:0.0000    1st Qu.:0.000    1st Qu.:0.0000   1st Qu.:5.000   
##  Median :0.0000    Median :0.000    Median :0.0000   Median :6.000   
##  Mean   :0.1592    Mean   :0.173    Mean   :0.6678   Mean   :5.958   
##  3rd Qu.:0.0000    3rd Qu.:0.000    3rd Qu.:1.0000   3rd Qu.:6.000   
##  Max.   :6.0000    Max.   :9.000    Max.   :9.0000   Max.   :9.000   
##   readmitted          diag_circ        diag_resp         diag_dig     
##  Length:289         Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  Class :character   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Mode  :character   Median :1.0000   Median :0.0000   Median :0.0000  
##                     Mean   :0.5363   Mean   :0.3114   Mean   :0.1211  
##                     3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##                     Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##    diag_diab         diag_inj         diag_musc         diag_geni     
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.00000   Median :0.00000   Median :0.0000  
##  Mean   :0.6263   Mean   :0.06574   Mean   :0.04498   Mean   :0.1453  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##    diag_neop         diag_other     raceAfricanAmerican   raceAsian      
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000      Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000      1st Qu.:0.00000  
##  Median :0.00000   Median :1.0000   Median :0.0000      Median :0.00000  
##  Mean   :0.04152   Mean   :0.5467   Mean   :0.1765      Mean   :0.02422  
##  3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.0000      3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000      Max.   :1.00000  
##  raceCaucasian     raceHispanic      raceOther        genderFemale   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :0.00000   Median :1.0000  
##  Mean   :0.6228   Mean   :0.1315   Mean   :0.04498   Mean   :0.5813  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##    genderMale        agefirst         agesecond          agethird      
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.4187   Mean   :0.01384   Mean   :0.01384   Mean   :0.05882  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##     ageforth         agefifth         agesixth        ageseventh    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.1315   Mean   :0.2249   Mean   :0.2076   Mean   :0.2076  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##    ageeighth      discharge_disposition_id1 discharge_disposition_id2
##  Min.   :0.0000   Min.   :0.0000            Min.   :0.0000           
##  1st Qu.:0.0000   1st Qu.:0.0000            1st Qu.:0.0000           
##  Median :0.0000   Median :1.0000            Median :0.0000           
##  Mean   :0.1419   Mean   :0.6574            Mean   :0.0692           
##  3rd Qu.:0.0000   3rd Qu.:1.0000            3rd Qu.:0.0000           
##  Max.   :1.0000   Max.   :1.0000            Max.   :1.0000           
##  discharge_disposition_id3 discharge_disposition_id5 discharge_disposition_id6
##  Min.   :0.0000            Min.   :0.0000            Min.   :0.00000          
##  1st Qu.:0.0000            1st Qu.:0.0000            1st Qu.:0.00000          
##  Median :0.0000            Median :0.0000            Median :0.00000          
##  Mean   :0.1142            Mean   :0.0173            Mean   :0.09343          
##  3rd Qu.:0.0000            3rd Qu.:0.0000            3rd Qu.:0.00000          
##  Max.   :1.0000            Max.   :1.0000            Max.   :1.00000          
##  discharge_disposition_id7 discharge_disposition_id10
##  Min.   :0.00000           Min.   :0.00000           
##  1st Qu.:0.00000           1st Qu.:0.00000           
##  Median :0.00000           Median :0.00000           
##  Mean   :0.03114           Mean   :0.00346           
##  3rd Qu.:0.00000           3rd Qu.:0.00000           
##  Max.   :1.00000           Max.   :1.00000           
##  discharge_disposition_id11 discharge_disposition_id13 admission_type_id1
##  Min.   :0.00000            Min.   :0.00000            Min.   :0.0000    
##  1st Qu.:0.00000            1st Qu.:0.00000            1st Qu.:0.0000    
##  Median :0.00000            Median :0.00000            Median :0.0000    
##  Mean   :0.01038            Mean   :0.00346            Mean   :0.1799    
##  3rd Qu.:0.00000            3rd Qu.:0.00000            3rd Qu.:0.0000    
##  Max.   :1.00000            Max.   :1.00000            Max.   :1.0000    
##  admission_type_id2 admission_type_id3 admission_type_id6 max_glu_serum200
##  Min.   :0.00000    Min.   :0.00000    Min.   :0.0000     Min.   :0.0000  
##  1st Qu.:0.00000    1st Qu.:0.00000    1st Qu.:1.0000     1st Qu.:0.0000  
##  Median :0.00000    Median :0.00000    Median :1.0000     Median :0.0000  
##  Mean   :0.01384    Mean   :0.00692    Mean   :0.7993     Mean   :0.2388  
##  3rd Qu.:0.00000    3rd Qu.:0.00000    3rd Qu.:1.0000     3rd Qu.:0.0000  
##  Max.   :1.00000    Max.   :1.00000    Max.   :1.0000     Max.   :1.0000  
##  max_glu_serum300 max_glu_serumNorm   A1Cresult7      A1Cresult8    
##  Min.   :0.0000   Min.   :0.0000    Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000    1st Qu.:0.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000    Median :0.000   Median :1.0000  
##  Mean   :0.4291   Mean   :0.3322    Mean   :0.218   Mean   :0.5917  
##  3rd Qu.:1.0000   3rd Qu.:1.0000    3rd Qu.:0.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000    Max.   :1.000   Max.   :1.0000  
##  A1CresultNorm    admission_source_id1 admission_source_id2
##  Min.   :0.0000   Min.   :0.00000      Min.   :0.00000     
##  1st Qu.:0.0000   1st Qu.:0.00000      1st Qu.:0.00000     
##  Median :0.0000   Median :0.00000      Median :0.00000     
##  Mean   :0.1903   Mean   :0.07958      Mean   :0.00692     
##  3rd Qu.:0.0000   3rd Qu.:0.00000      3rd Qu.:0.00000     
##  Max.   :1.0000   Max.   :1.00000      Max.   :1.00000     
##  admission_source_id7  insulinDown       insulinNo     insulinSteady   
##  Min.   :0.0000       Min.   :0.0000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:1.0000       1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :1.0000       Median :0.0000   Median :1.000   Median :0.0000  
##  Mean   :0.9135       Mean   :0.0692   Mean   :0.699   Mean   :0.1834  
##  3rd Qu.:1.0000       3rd Qu.:0.0000   3rd Qu.:1.000   3rd Qu.:0.0000  
##  Max.   :1.0000       Max.   :1.0000   Max.   :1.000   Max.   :1.0000  
##    insulinUp          changeCh         changeNo      diabetesMedNo   
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.04844   Mean   :0.2907   Mean   :0.7093   Mean   :0.4083  
##  3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  diabetesMedYes  
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :1.0000  
##  Mean   :0.5917  
##  3rd Qu.:1.0000  
##  Max.   :1.0000
nrow(diabetic_largedata)
## [1] 289
ncol(diabetic_largedata)
## [1] 63

#Bringing outcome variable to the last column

cbind(colnames(diabetic_largedata))
##       [,1]                        
##  [1,] "time_in_hospital"          
##  [2,] "num_lab_procedures"        
##  [3,] "num_procedures"            
##  [4,] "num_medications"           
##  [5,] "number_outpatient"         
##  [6,] "number_emergency"          
##  [7,] "number_inpatient"          
##  [8,] "number_diagnoses"          
##  [9,] "readmitted"                
## [10,] "diag_circ"                 
## [11,] "diag_resp"                 
## [12,] "diag_dig"                  
## [13,] "diag_diab"                 
## [14,] "diag_inj"                  
## [15,] "diag_musc"                 
## [16,] "diag_geni"                 
## [17,] "diag_neop"                 
## [18,] "diag_other"                
## [19,] "raceAfricanAmerican"       
## [20,] "raceAsian"                 
## [21,] "raceCaucasian"             
## [22,] "raceHispanic"              
## [23,] "raceOther"                 
## [24,] "genderFemale"              
## [25,] "genderMale"                
## [26,] "agefirst"                  
## [27,] "agesecond"                 
## [28,] "agethird"                  
## [29,] "ageforth"                  
## [30,] "agefifth"                  
## [31,] "agesixth"                  
## [32,] "ageseventh"                
## [33,] "ageeighth"                 
## [34,] "discharge_disposition_id1" 
## [35,] "discharge_disposition_id2" 
## [36,] "discharge_disposition_id3" 
## [37,] "discharge_disposition_id5" 
## [38,] "discharge_disposition_id6" 
## [39,] "discharge_disposition_id7" 
## [40,] "discharge_disposition_id10"
## [41,] "discharge_disposition_id11"
## [42,] "discharge_disposition_id13"
## [43,] "admission_type_id1"        
## [44,] "admission_type_id2"        
## [45,] "admission_type_id3"        
## [46,] "admission_type_id6"        
## [47,] "max_glu_serum200"          
## [48,] "max_glu_serum300"          
## [49,] "max_glu_serumNorm"         
## [50,] "A1Cresult7"                
## [51,] "A1Cresult8"                
## [52,] "A1CresultNorm"             
## [53,] "admission_source_id1"      
## [54,] "admission_source_id2"      
## [55,] "admission_source_id7"      
## [56,] "insulinDown"               
## [57,] "insulinNo"                 
## [58,] "insulinSteady"             
## [59,] "insulinUp"                 
## [60,] "changeCh"                  
## [61,] "changeNo"                  
## [62,] "diabetesMedNo"             
## [63,] "diabetesMedYes"
diabetic_largedata<-diabetic_largedata[c(1:8, 10:63, 9)]
head(diabetic_largedata)
##     time_in_hospital num_lab_procedures num_procedures num_medications
## 163                5                 47              1               6
## 461               10                 72              1              19
## 594                2                 61              0               5
## 697               11                 71              1              20
## 772               14                 43              0              11
## 824                7                105              3              16
##     number_outpatient number_emergency number_inpatient number_diagnoses
## 163                 0                0                0                5
## 461                 0                0                0                5
## 594                 0                0                0                5
## 697                 0                0                0                5
## 772                 0                0                0                3
## 824                 0                0                0                5
##     diag_circ diag_resp diag_dig diag_diab diag_inj diag_musc diag_geni
## 163         1         0        0         0        0         0         0
## 461         0         0        0         1        0         0         0
## 594         0         0        0         1        0         0         0
## 697         0         0        0         1        1         0         0
## 772         0         0        1         1        0         0         0
## 824         1         0        0         1        0         0         0
##     diag_neop diag_other raceAfricanAmerican raceAsian raceCaucasian
## 163         0          1                   0         0             1
## 461         0          1                   1         0             0
## 594         0          1                   0         0             1
## 697         0          0                   0         0             0
## 772         0          1                   0         0             1
## 824         0          1                   0         0             1
##     raceHispanic raceOther genderFemale genderMale agefirst agesecond agethird
## 163            0         0            0          1        0         0        0
## 461            0         0            1          0        0         0        0
## 594            0         0            1          0        0         0        0
## 697            0         1            0          1        0         0        0
## 772            0         0            1          0        0         0        1
## 824            0         0            0          1        0         0        0
##     ageforth agefifth agesixth ageseventh ageeighth discharge_disposition_id1
## 163        0        0        0          0         1                         0
## 461        0        0        0          1         0                         1
## 594        0        1        0          0         0                         1
## 697        0        0        0          1         0                         0
## 772        0        0        0          0         0                         1
## 824        0        0        0          0         1                         1
##     discharge_disposition_id2 discharge_disposition_id3
## 163                         0                         1
## 461                         0                         0
## 594                         0                         0
## 697                         0                         0
## 772                         0                         0
## 824                         0                         0
##     discharge_disposition_id5 discharge_disposition_id6
## 163                         0                         0
## 461                         0                         0
## 594                         0                         0
## 697                         0                         1
## 772                         0                         0
## 824                         0                         0
##     discharge_disposition_id7 discharge_disposition_id10
## 163                         0                          0
## 461                         0                          0
## 594                         0                          0
## 697                         0                          0
## 772                         0                          0
## 824                         0                          0
##     discharge_disposition_id11 discharge_disposition_id13 admission_type_id1
## 163                          0                          0                  0
## 461                          0                          0                  0
## 594                          0                          0                  0
## 697                          0                          0                  0
## 772                          0                          0                  0
## 824                          0                          0                  0
##     admission_type_id2 admission_type_id3 admission_type_id6 max_glu_serum200
## 163                  0                  0                  1                1
## 461                  0                  0                  1                0
## 594                  0                  0                  1                0
## 697                  0                  0                  1                1
## 772                  0                  0                  1                0
## 824                  0                  0                  1                0
##     max_glu_serum300 max_glu_serumNorm A1Cresult7 A1Cresult8 A1CresultNorm
## 163                0                 0          0          0             1
## 461                1                 0          0          1             0
## 594                1                 0          0          1             0
## 697                0                 0          1          0             0
## 772                0                 1          1          0             0
## 824                1                 0          1          0             0
##     admission_source_id1 admission_source_id2 admission_source_id7 insulinDown
## 163                    0                    0                    1           0
## 461                    0                    0                    1           0
## 594                    0                    0                    1           0
## 697                    0                    0                    1           0
## 772                    0                    1                    0           0
## 824                    0                    0                    1           0
##     insulinNo insulinSteady insulinUp changeCh changeNo diabetesMedNo
## 163         1             0         0        0        1             1
## 461         0             0         1        1        0             0
## 594         0             1         0        0        1             0
## 697         1             0         0        0        1             0
## 772         1             0         0        0        1             1
## 824         1             0         0        0        1             0
##     diabetesMedYes readmitted
## 163              0        YES
## 461              1        YES
## 594              1         NO
## 697              1         NO
## 772              0        YES
## 824              1        YES
cbind(colnames(diabetic_largedata))
##       [,1]                        
##  [1,] "time_in_hospital"          
##  [2,] "num_lab_procedures"        
##  [3,] "num_procedures"            
##  [4,] "num_medications"           
##  [5,] "number_outpatient"         
##  [6,] "number_emergency"          
##  [7,] "number_inpatient"          
##  [8,] "number_diagnoses"          
##  [9,] "diag_circ"                 
## [10,] "diag_resp"                 
## [11,] "diag_dig"                  
## [12,] "diag_diab"                 
## [13,] "diag_inj"                  
## [14,] "diag_musc"                 
## [15,] "diag_geni"                 
## [16,] "diag_neop"                 
## [17,] "diag_other"                
## [18,] "raceAfricanAmerican"       
## [19,] "raceAsian"                 
## [20,] "raceCaucasian"             
## [21,] "raceHispanic"              
## [22,] "raceOther"                 
## [23,] "genderFemale"              
## [24,] "genderMale"                
## [25,] "agefirst"                  
## [26,] "agesecond"                 
## [27,] "agethird"                  
## [28,] "ageforth"                  
## [29,] "agefifth"                  
## [30,] "agesixth"                  
## [31,] "ageseventh"                
## [32,] "ageeighth"                 
## [33,] "discharge_disposition_id1" 
## [34,] "discharge_disposition_id2" 
## [35,] "discharge_disposition_id3" 
## [36,] "discharge_disposition_id5" 
## [37,] "discharge_disposition_id6" 
## [38,] "discharge_disposition_id7" 
## [39,] "discharge_disposition_id10"
## [40,] "discharge_disposition_id11"
## [41,] "discharge_disposition_id13"
## [42,] "admission_type_id1"        
## [43,] "admission_type_id2"        
## [44,] "admission_type_id3"        
## [45,] "admission_type_id6"        
## [46,] "max_glu_serum200"          
## [47,] "max_glu_serum300"          
## [48,] "max_glu_serumNorm"         
## [49,] "A1Cresult7"                
## [50,] "A1Cresult8"                
## [51,] "A1CresultNorm"             
## [52,] "admission_source_id1"      
## [53,] "admission_source_id2"      
## [54,] "admission_source_id7"      
## [55,] "insulinDown"               
## [56,] "insulinNo"                 
## [57,] "insulinSteady"             
## [58,] "insulinUp"                 
## [59,] "changeCh"                  
## [60,] "changeNo"                  
## [61,] "diabetesMedNo"             
## [62,] "diabetesMedYes"            
## [63,] "readmitted"

#Omitting the columns that their variance is near to 0 like 0.01

#lapply(data.frame(dummy1),function(x)table(x))
#some of the columns has so many 0s
table_list <- lapply(data.frame(dummy1), function(x) table(x))
plot_table <- function(tbl, var_name) {
  barplot(tbl, main = var_name, col = rainbow(length(tbl)), 
          xlab = "", ylab = "", border = NA)
  legend("topright", legend = names(tbl), fill = rainbow(length(tbl)), 
         bty = "n", cex = 0.8)
}

mapply(plot_table, table_list, var_name = colnames(dummy1))

##      race.AfricanAmerican race.Asian race.Caucasian race.Hispanic race.Other
## rect list,4               list,4     list,4         list,4        list,4    
## text list,2               list,2     list,2         list,2        list,2    
##      gender.Female gender.Male agefirst agesecond agethird ageforth agefifth
## rect list,4        list,4      list,4   list,4    list,4   list,4   list,4  
## text list,2        list,2      list,2   list,2    list,2   list,2   list,2  
##      agesixth ageseventh ageeighth discharge_disposition_id.1
## rect list,4   list,4     list,4    list,4                    
## text list,2   list,2     list,2    list,2                    
##      discharge_disposition_id.2 discharge_disposition_id.3
## rect list,4                     list,4                    
## text list,2                     list,2                    
##      discharge_disposition_id.5 discharge_disposition_id.6
## rect list,4                     list,4                    
## text list,2                     list,2                    
##      discharge_disposition_id.7 discharge_disposition_id.10
## rect list,4                     list,4                     
## text list,2                     list,2                     
##      discharge_disposition_id.11 discharge_disposition_id.13
## rect list,4                      list,4                     
## text list,2                      list,2                     
##      admission_type_id.1 admission_type_id.2 admission_type_id.3
## rect list,4              list,4              list,4             
## text list,2              list,2              list,2             
##      admission_type_id.6 max_glu_serum200 max_glu_serum300 max_glu_serum.Norm
## rect list,4              list,4           list,4           list,4            
## text list,2              list,2           list,2           list,2            
##      A1Cresult7 A1Cresult8 A1Cresult.Norm admission_source_id.1
## rect list,4     list,4     list,4         list,4               
## text list,2     list,2     list,2         list,2               
##      admission_source_id.2 admission_source_id.7 insulin.Down insulin.No
## rect list,4                list,4                list,4       list,4    
## text list,2                list,2                list,2       list,2    
##      insulin.Steady insulin.Up change.Ch change.No diabetesMed.No
## rect list,4         list,4     list,4    list,4    list,4        
## text list,2         list,2     list,2    list,2    list,2        
##      diabetesMed.Yes
## rect list,4         
## text list,2
lapply(data.frame(dummy1),function(x)var(x))
## $race.AfricanAmerican
## [1] 0.1458333
## 
## $race.Asian
## [1] 0.02371684
## 
## $race.Caucasian
## [1] 0.2357266
## 
## $race.Hispanic
## [1] 0.1145953
## 
## $race.Other
## [1] 0.04310842
## 
## $gender.Female
## [1] 0.244233
## 
## $gender.Male
## [1] 0.244233
## 
## $agefirst
## [1] 0.01369666
## 
## $agesecond
## [1] 0.01369666
## 
## $agethird
## [1] 0.05555556
## 
## $ageforth
## [1] 0.1145953
## 
## $agefifth
## [1] 0.1749327
## 
## $agesixth
## [1] 0.1650807
## 
## $ageseventh
## [1] 0.1650807
## 
## $ageeighth
## [1] 0.1221646
## 
## $discharge_disposition_id.1
## [1] 0.2259948
## 
## $discharge_disposition_id.2
## [1] 0.0646386
## 
## $discharge_disposition_id.3
## [1] 0.1014994
## 
## $discharge_disposition_id.5
## [1] 0.01706075
## 
## $discharge_disposition_id.6
## [1] 0.08499135
## 
## $discharge_disposition_id.7
## [1] 0.03027682
## 
## $discharge_disposition_id.10
## [1] 0.003460208
## 
## $discharge_disposition_id.11
## [1] 0.01030854
## 
## $discharge_disposition_id.13
## [1] 0.003460208
## 
## $admission_type_id.1
## [1] 0.1480681
## 
## $admission_type_id.2
## [1] 0.01369666
## 
## $admission_type_id.3
## [1] 0.006896386
## 
## $admission_type_id.6
## [1] 0.1609717
## 
## $max_glu_serum200
## [1] 0.1823818
## 
## $max_glu_serum300
## [1] 0.2458189
## 
## $max_glu_serum.Norm
## [1] 0.2226067
## 
## $A1Cresult7
## [1] 0.171064
## 
## $A1Cresult8
## [1] 0.2424308
## 
## $A1Cresult.Norm
## [1] 0.154628
## 
## $admission_source_id.1
## [1] 0.07350538
## 
## $admission_source_id.2
## [1] 0.006896386
## 
## $admission_source_id.7
## [1] 0.07929642
## 
## $insulin.Down
## [1] 0.0646386
## 
## $insulin.No
## [1] 0.2111448
## 
## $insulin.Steady
## [1] 0.1502787
## 
## $insulin.Up
## [1] 0.04625625
## 
## $change.Ch
## [1] 0.2068916
## 
## $change.No
## [1] 0.2068916
## 
## $diabetesMed.No
## [1] 0.2424308
## 
## $diabetesMed.Yes
## [1] 0.2424308
#omiting the columns with the nearZero variance
variances <- lapply(diabetic_largedata,function(x)var(x))
near_zero <- which(variances < 0.05)
colnames(diabetic_largedata)[near_zero]
##  [1] "diag_musc"                  "diag_neop"                 
##  [3] "raceAsian"                  "raceOther"                 
##  [5] "agefirst"                   "agesecond"                 
##  [7] "discharge_disposition_id5"  "discharge_disposition_id7" 
##  [9] "discharge_disposition_id10" "discharge_disposition_id11"
## [11] "discharge_disposition_id13" "admission_type_id2"        
## [13] "admission_type_id3"         "admission_source_id2"      
## [15] "insulinUp"
cols_to_remove <- c("diag_musc","diag_neop","raceAsian","raceOther","agefirst","agesecond","discharge_disposition_id5","discharge_disposition_id7","discharge_disposition_id10","discharge_disposition_id11","discharge_disposition_id13","admission_type_id2","admission_type_id3","admission_source_id2","insulinUp")

diabetic_largedata <- diabetic_largedata[, !(colnames(diabetic_largedata) %in% cols_to_remove)]
cbind(colnames(diabetic_largedata))
##       [,1]                       
##  [1,] "time_in_hospital"         
##  [2,] "num_lab_procedures"       
##  [3,] "num_procedures"           
##  [4,] "num_medications"          
##  [5,] "number_outpatient"        
##  [6,] "number_emergency"         
##  [7,] "number_inpatient"         
##  [8,] "number_diagnoses"         
##  [9,] "diag_circ"                
## [10,] "diag_resp"                
## [11,] "diag_dig"                 
## [12,] "diag_diab"                
## [13,] "diag_inj"                 
## [14,] "diag_geni"                
## [15,] "diag_other"               
## [16,] "raceAfricanAmerican"      
## [17,] "raceCaucasian"            
## [18,] "raceHispanic"             
## [19,] "genderFemale"             
## [20,] "genderMale"               
## [21,] "agethird"                 
## [22,] "ageforth"                 
## [23,] "agefifth"                 
## [24,] "agesixth"                 
## [25,] "ageseventh"               
## [26,] "ageeighth"                
## [27,] "discharge_disposition_id1"
## [28,] "discharge_disposition_id2"
## [29,] "discharge_disposition_id3"
## [30,] "discharge_disposition_id6"
## [31,] "admission_type_id1"       
## [32,] "admission_type_id6"       
## [33,] "max_glu_serum200"         
## [34,] "max_glu_serum300"         
## [35,] "max_glu_serumNorm"        
## [36,] "A1Cresult7"               
## [37,] "A1Cresult8"               
## [38,] "A1CresultNorm"            
## [39,] "admission_source_id1"     
## [40,] "admission_source_id7"     
## [41,] "insulinDown"              
## [42,] "insulinNo"                
## [43,] "insulinSteady"            
## [44,] "changeCh"                 
## [45,] "changeNo"                 
## [46,] "diabetesMedNo"            
## [47,] "diabetesMedYes"           
## [48,] "readmitted"
nzv_cols <- nearZeroVar(diabetic_largedata[, -48], saveMetrics = TRUE)$nzv
cbind(nzv_cols)
##       nzv_cols
##  [1,]    FALSE
##  [2,]    FALSE
##  [3,]    FALSE
##  [4,]    FALSE
##  [5,]     TRUE
##  [6,]     TRUE
##  [7,]    FALSE
##  [8,]    FALSE
##  [9,]    FALSE
## [10,]    FALSE
## [11,]    FALSE
## [12,]    FALSE
## [13,]    FALSE
## [14,]    FALSE
## [15,]    FALSE
## [16,]    FALSE
## [17,]    FALSE
## [18,]    FALSE
## [19,]    FALSE
## [20,]    FALSE
## [21,]    FALSE
## [22,]    FALSE
## [23,]    FALSE
## [24,]    FALSE
## [25,]    FALSE
## [26,]    FALSE
## [27,]    FALSE
## [28,]    FALSE
## [29,]    FALSE
## [30,]    FALSE
## [31,]    FALSE
## [32,]    FALSE
## [33,]    FALSE
## [34,]    FALSE
## [35,]    FALSE
## [36,]    FALSE
## [37,]    FALSE
## [38,]    FALSE
## [39,]    FALSE
## [40,]    FALSE
## [41,]    FALSE
## [42,]    FALSE
## [43,]    FALSE
## [44,]    FALSE
## [45,]    FALSE
## [46,]    FALSE
## [47,]    FALSE
diabetic_largedata<-diabetic_largedata[,-c(5,6)]
str(diabetic_largedata)
## 'data.frame':    289 obs. of  46 variables:
##  $ time_in_hospital         : int  5 10 2 11 14 7 2 3 2 4 ...
##  $ num_lab_procedures       : int  47 72 61 71 43 105 66 76 43 41 ...
##  $ num_procedures           : int  1 1 0 1 0 3 0 0 0 1 ...
##  $ num_medications          : int  6 19 5 20 11 16 3 9 13 8 ...
##  $ number_inpatient         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_diagnoses         : int  5 5 5 5 3 5 3 5 5 3 ...
##  $ diag_circ                : num  1 0 0 0 0 1 1 0 0 0 ...
##  $ diag_resp                : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ diag_dig                 : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ diag_diab                : num  0 1 1 1 1 1 1 0 1 1 ...
##  $ diag_inj                 : num  0 0 0 1 0 0 0 0 1 0 ...
##  $ diag_geni                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_other               : num  1 1 1 0 1 1 1 1 1 0 ...
##  $ raceAfricanAmerican      : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ raceCaucasian            : num  1 0 1 0 1 1 0 1 1 1 ...
##  $ raceHispanic             : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ genderFemale             : num  0 1 1 0 1 0 1 1 1 0 ...
##  $ genderMale               : num  1 0 0 1 0 1 0 0 0 1 ...
##  $ agethird                 : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ ageforth                 : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ agefifth                 : num  0 0 1 0 0 0 1 0 1 1 ...
##  $ agesixth                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ageseventh               : num  0 1 0 1 0 0 0 0 0 0 ...
##  $ ageeighth                : num  1 0 0 0 0 1 0 0 0 0 ...
##  $ discharge_disposition_id1: num  0 1 1 0 1 1 1 1 1 0 ...
##  $ discharge_disposition_id2: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id3: num  1 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id6: num  0 0 0 1 0 0 0 0 0 0 ...
##  $ admission_type_id1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id6       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ max_glu_serum200         : num  1 0 0 1 0 0 0 0 0 1 ...
##  $ max_glu_serum300         : num  0 1 1 0 0 1 0 1 1 0 ...
##  $ max_glu_serumNorm        : num  0 0 0 0 1 0 1 0 0 0 ...
##  $ A1Cresult7               : num  0 0 0 1 1 1 1 1 1 0 ...
##  $ A1Cresult8               : num  0 1 1 0 0 0 0 0 0 1 ...
##  $ A1CresultNorm            : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ admission_source_id1     : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ admission_source_id7     : num  1 1 1 1 0 1 1 1 1 0 ...
##  $ insulinDown              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ insulinNo                : num  1 0 0 1 1 1 1 1 1 1 ...
##  $ insulinSteady            : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ changeCh                 : num  0 1 0 0 0 0 0 1 0 0 ...
##  $ changeNo                 : num  1 0 1 1 1 1 1 0 1 1 ...
##  $ diabetesMedNo            : num  1 0 0 0 1 0 0 0 1 0 ...
##  $ diabetesMedYes           : num  0 1 1 1 0 1 1 1 0 1 ...
##  $ readmitted               : chr  "YES" "YES" "NO" "NO" ...

#Test & Train Split

library(caTools)
set.seed(123)
split<-sample.split(diabetic_largedata$readmitted,SplitRatio=0.8)
train_diabetic_largedata<-subset(diabetic_largedata,split==T)
test_diabetic_largedata<-subset(diabetic_largedata,split==F)
sum(nrow(train_diabetic_largedata),nrow(test_diabetic_largedata))
## [1] 289

#Feature Scaling

library(caret)
normParam <- preProcess(train_diabetic_largedata, method = c("center", "scale"))
train_diabetic_largedata <- predict(normParam, train_diabetic_largedata)
test_diabetic_largedata<- predict(normParam,test_diabetic_largedata)
head(train_diabetic_largedata)
##      time_in_hospital num_lab_procedures num_procedures num_medications
## 163        -0.1505286         -1.2039769      0.1168388      -1.1571953
## 594        -1.1346449         -0.2232058     -0.6769776      -1.2905846
## 697         1.8177039          0.4773450      0.1168388       0.7102546
## 772         2.8018202         -1.4841972     -0.6769776      -0.4902489
## 1281       -0.8066061          0.8276204     -0.6769776      -0.7570275
## 1756       -1.1346449         -1.4841972     -0.6769776      -0.2234704
##      number_inpatient number_diagnoses  diag_circ  diag_resp   diag_dig
## 163        -0.5275243       -0.6526203  0.9188738 -0.6647022 -0.3705859
## 594        -0.5275243       -0.6526203 -1.0835776 -0.6647022 -0.3705859
## 697        -0.5275243       -0.6526203 -1.0835776 -0.6647022 -0.3705859
## 772        -0.5275243       -1.9867378 -1.0835776 -0.6647022  2.6867480
## 1281       -0.5275243       -0.6526203 -1.0835776 -0.6647022 -0.3705859
## 1756       -0.5275243       -0.6526203 -1.0835776 -0.6647022 -0.3705859
##       diag_diab  diag_inj  diag_geni diag_other raceAfricanAmerican
## 163  -1.2956649 -0.281239 -0.4357706  0.9029688          -0.4073637
## 594   0.7684633 -0.281239 -0.4357706  0.9029688          -0.4073637
## 697   0.7684633  3.540302 -0.4357706 -1.1026638          -0.4073637
## 772   0.7684633 -0.281239 -0.4357706  0.9029688          -0.4073637
## 1281 -1.2956649 -0.281239 -0.4357706  0.9029688          -0.4073637
## 1756  0.7684633  3.540302 -0.4357706  0.9029688          -0.4073637
##      raceCaucasian raceHispanic genderFemale genderMale   agethird   ageforth
## 163      0.7124584   -0.3928473   -1.1624404  1.1624404 -0.2436696 -0.4001349
## 594      0.7124584   -0.3928473    0.8565351 -0.8565351 -0.2436696 -0.4001349
## 697     -1.3975146   -0.3928473   -1.1624404  1.1624404 -0.2436696 -0.4001349
## 772      0.7124584   -0.3928473    0.8565351 -0.8565351  4.0861518 -0.4001349
## 1281     0.7124584   -0.3928473    0.8565351 -0.8565351 -0.2436696  2.4883386
## 1756     0.7124584   -0.3928473    0.8565351 -0.8565351 -0.2436696 -0.4001349
##        agefifth   agesixth ageseventh  ageeighth discharge_disposition_id1
## 163  -0.5043104 -0.5043104 -0.5244495  2.3613042                -1.3450677
## 594   1.9743217 -0.5043104 -0.5244495 -0.4216615                 0.7402386
## 697  -0.5043104 -0.5043104  1.8985070 -0.4216615                -1.3450677
## 772  -0.5043104 -0.5043104 -0.5244495 -0.4216615                 0.7402386
## 1281 -0.5043104 -0.5043104 -0.5244495 -0.4216615                 0.7402386
## 1756  1.9743217 -0.5043104 -0.5244495 -0.4216615                 0.7402386
##      discharge_disposition_id2 discharge_disposition_id3
## 163                 -0.2629521                 2.7427810
## 594                 -0.2629521                -0.3630151
## 697                 -0.2629521                -0.3630151
## 772                 -0.2629521                -0.3630151
## 1281                -0.2629521                -0.3630151
## 1756                -0.2629521                -0.3630151
##      discharge_disposition_id6 admission_type_id1 admission_type_id6
## 163                 -0.3397648         -0.4703831          0.5110378
## 594                 -0.3397648         -0.4703831          0.5110378
## 697                  2.9304714         -0.4703831          0.5110378
## 772                 -0.3397648         -0.4703831          0.5110378
## 1281                -0.3397648         -0.4703831          0.5110378
## 1756                -0.3397648         -0.4703831          0.5110378
##      max_glu_serum200 max_glu_serum300 max_glu_serumNorm A1Cresult7 A1Cresult8
## 163         1.7639365       -0.8414468        -0.7193656 -0.5511479 -1.1832846
## 594        -0.5644597        1.1832846        -0.7193656 -0.5511479  0.8414468
## 697         1.7639365       -0.8414468        -0.7193656  1.8065404 -1.1832846
## 772        -0.5644597       -0.8414468         1.3840959  1.8065404 -1.1832846
## 1281       -0.5644597        1.1832846        -0.7193656  1.8065404 -1.1832846
## 1756       -0.5644597        1.1832846        -0.7193656  1.8065404 -1.1832846
##      A1CresultNorm admission_source_id1 admission_source_id7 insulinDown
## 163      2.1167238           -0.3155425            0.3318105  -0.2534499
## 594     -0.4703831           -0.3155425            0.3318105  -0.2534499
## 697     -0.4703831           -0.3155425            0.3318105  -0.2534499
## 772     -0.4703831           -0.3155425           -3.0007214  -0.2534499
## 1281    -0.4703831           -0.3155425            0.3318105  -0.2534499
## 1756    -0.4703831           -0.3155425            0.3318105  -0.2534499
##       insulinNo insulinSteady   changeCh   changeNo diabetesMedNo
## 163   0.6243966    -0.4497173 -0.6177172  0.6177172     1.1521985
## 594  -1.5946130     2.2139930 -0.6177172  0.6177172    -0.8641489
## 697   0.6243966    -0.4497173 -0.6177172  0.6177172    -0.8641489
## 772   0.6243966    -0.4497173 -0.6177172  0.6177172     1.1521985
## 1281  0.6243966    -0.4497173  1.6118558 -1.6118558    -0.8641489
## 1756  0.6243966    -0.4497173 -0.6177172  0.6177172     1.1521985
##      diabetesMedYes readmitted
## 163      -1.1521985        YES
## 594       0.8641489         NO
## 697       0.8641489         NO
## 772      -1.1521985        YES
## 1281      0.8641489        YES
## 1756     -1.1521985        YES
#install.packages("reshape2")
library(reshape2)
library("ggplot2")
plot_heatmap <- function(data, outcome_var) {
 
  data_numeric <- data[, sapply(data, is.numeric)]

  corr_matrix <- cor(data_numeric)
  ggplot(data = melt(corr_matrix), aes(x = Var2, y = Var1, fill = value)) +
    geom_tile() +
    scale_fill_gradient2(low = "#f7fbff", high = "#08306b", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Correlation") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, vjust = 1, size = 10, hjust = 1)) +
    ggtitle(paste("Correlation Heatmap of", ncol(data_numeric), "Numeric Variables"))
}

plot_heatmap(train_diabetic_largedata, readmitted)

train_diabetic_largedata$readmitted<-as.factor(train_diabetic_largedata$readmitted)
test_diabetic_largedata$readmitted<-as.factor(test_diabetic_largedata$readmitted)
str(train_diabetic_largedata$readmitted)
##  Factor w/ 2 levels "NO","YES": 2 1 1 2 2 2 2 2 1 2 ...
table(train_diabetic_largedata$readmitted)
## 
##  NO YES 
##  93 138
cbind(colnames(diabetic_largedata))
##       [,1]                       
##  [1,] "time_in_hospital"         
##  [2,] "num_lab_procedures"       
##  [3,] "num_procedures"           
##  [4,] "num_medications"          
##  [5,] "number_inpatient"         
##  [6,] "number_diagnoses"         
##  [7,] "diag_circ"                
##  [8,] "diag_resp"                
##  [9,] "diag_dig"                 
## [10,] "diag_diab"                
## [11,] "diag_inj"                 
## [12,] "diag_geni"                
## [13,] "diag_other"               
## [14,] "raceAfricanAmerican"      
## [15,] "raceCaucasian"            
## [16,] "raceHispanic"             
## [17,] "genderFemale"             
## [18,] "genderMale"               
## [19,] "agethird"                 
## [20,] "ageforth"                 
## [21,] "agefifth"                 
## [22,] "agesixth"                 
## [23,] "ageseventh"               
## [24,] "ageeighth"                
## [25,] "discharge_disposition_id1"
## [26,] "discharge_disposition_id2"
## [27,] "discharge_disposition_id3"
## [28,] "discharge_disposition_id6"
## [29,] "admission_type_id1"       
## [30,] "admission_type_id6"       
## [31,] "max_glu_serum200"         
## [32,] "max_glu_serum300"         
## [33,] "max_glu_serumNorm"        
## [34,] "A1Cresult7"               
## [35,] "A1Cresult8"               
## [36,] "A1CresultNorm"            
## [37,] "admission_source_id1"     
## [38,] "admission_source_id7"     
## [39,] "insulinDown"              
## [40,] "insulinNo"                
## [41,] "insulinSteady"            
## [42,] "changeCh"                 
## [43,] "changeNo"                 
## [44,] "diabetesMedNo"            
## [45,] "diabetesMedYes"           
## [46,] "readmitted"

Unsupervised Learning

##PCA

set.seed(123)
PCA <- prcomp(train_diabetic_largedata[,-46],
                 center = TRUE,
                 scale = TRUE) 
library("factoextra")
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library("factoextra")
get_eigenvalue(PCA)
##                                     eigenvalue
## Dim.1  4.6598325031985474709017580607905983925
## Dim.2  3.7805646840861397706134994223248213530
## Dim.3  2.9187897831060576336881240422371774912
## Dim.4  2.4503496574654932693704267876455560327
## Dim.5  2.3383766737560160997588809550506994128
## Dim.6  2.1227540207801922633734648115932941437
## Dim.7  1.9689523355885441002754987493972294033
## Dim.8  1.7768202520662772503357018649694509804
## Dim.9  1.6556170991362784139511177272652275860
## Dim.10 1.5769560456009974469537837649113498628
## Dim.11 1.5441741334508187755147901043528690934
## Dim.12 1.2948678670632081111335764944669790566
## Dim.13 1.2579494148627754501745812376611866057
## Dim.14 1.2280233243125018294250594408367760479
## Dim.15 1.1665266791117452349624272756045684218
## Dim.16 1.1079529810168642356416057737078517675
## Dim.17 1.0898202757579218413752641936298459768
## Dim.18 0.9939596665362285810019216114596929401
## Dim.19 0.9667172885340085120020603426382876933
## Dim.20 0.8810216142539648576104127641883678734
## Dim.21 0.8306493640521940280407875434320885688
## Dim.22 0.7782969625701783300542047072667628527
## Dim.23 0.7335514638375298934747092971520032734
## Dim.24 0.7082867909546097795470132041373290122
## Dim.25 0.6874449211291094874454188357049133629
## Dim.26 0.6554286266942238592392300233768764883
## Dim.27 0.6256500368707126336076385086926165968
## Dim.28 0.5727451562829668496590329596074298024
## Dim.29 0.5048664634834265640961348253767937422
## Dim.30 0.4744654246252194562671888888871762902
## Dim.31 0.4179866839151417101660968000942375511
## Dim.32 0.2896741405003985803112698249606182799
## Dim.33 0.2803808188302840398442583591531729326
## Dim.34 0.2511128013155019544910828699357807636
## Dim.35 0.1200714279244419480097150199071620591
## Dim.36 0.1023118327140242755923793538386235014
## Dim.37 0.0785489994747356201632015881841653027
## Dim.38 0.0414145218953088020685804337972513167
## Dim.39 0.0377707901635429832420953744076541625
## Dim.40 0.0293164730818278491952799669206797262
## Dim.41 0.0000000000000000000000000000009189899
## Dim.42 0.0000000000000000000000000000001620751
## Dim.43 0.0000000000000000000000000000001418502
## Dim.44 0.0000000000000000000000000000001237976
## Dim.45 0.0000000000000000000000000000001133062
##                                variance.percent cumulative.variance.percent
## Dim.1  10.3551833404412310102316041593439877033                    10.35518
## Dim.2   8.4012548535247670855596879846416413784                    18.75644
## Dim.3   6.4861995180134703886665192840155214071                    25.24264
## Dim.4   5.4452214610344373824091235292144119740                    30.68786
## Dim.5   5.1963926083467093519629997899755835533                    35.88425
## Dim.6   4.7172311572893228515113150933757424355                    40.60148
## Dim.7   4.3754496346412157237182327662594616413                    44.97693
## Dim.8   3.9484894490361770458264345506904646754                    48.92542
## Dim.9   3.6791491091917354339102530502714216709                    52.60457
## Dim.10  3.5043467680022217791702132672071456909                    56.10892
## Dim.11  3.4314980743351579661748473881743848324                    59.54042
## Dim.12  2.8774841490293550094747843104414641857                    62.41790
## Dim.13  2.7954431441395048985043558786856010556                    65.21334
## Dim.14  2.7289407206944522599201263801660388708                    67.94228
## Dim.15  2.5922815091372153339932538074208423495                    70.53457
## Dim.16  2.4621177355930350394430661253863945603                    72.99668
## Dim.17  2.4218228350176076446587103419005870819                    75.41851
## Dim.18  2.2087992589694001033251424814807251096                    77.62731
## Dim.19  2.1482606411866886908512697118567302823                    79.77557
## Dim.20  1.9578258094532581257141146124922670424                    81.73339
## Dim.21  1.8458874756715448750554742218810133636                    83.57928
## Dim.22  1.7295488057115098179394863109337165952                    85.30883
## Dim.23  1.6301143640834021564245404078974388540                    86.93894
## Dim.24  1.5739706465658018785802596539724618196                    88.51291
## Dim.25  1.5276553802869121678753572268760763109                    90.04057
## Dim.26  1.4565080593204995373923793522408232093                    91.49708
## Dim.27  1.3903334152682522706356849084841087461                    92.88741
## Dim.28  1.2727670139621503064830676521523855627                    94.16018
## Dim.29  1.1219254744076163632371390121988952160                    95.28210
## Dim.30  1.0543676102782670866986336477566510439                    96.33647
## Dim.31  0.9288592975892050462860538573295343667                    97.26533
## Dim.32  0.6437203122231088814331201319873798639                    97.90905
## Dim.33  0.6230684862895209397137818996270652860                    98.53212
## Dim.34  0.5580284473677828982474125041335355490                    99.09015
## Dim.35  0.2668253953876491557650751929031684995                    99.35697
## Dim.36  0.2273596282533875967413194985056179576                    99.58433
## Dim.37  0.1745533321660794057184062921805889346                    99.75888
## Dim.38  0.0920322708784641341228649480399326421                    99.85092
## Dim.39  0.0839350892523178654380799912360089365                    99.93485
## Dim.40  0.0651477179596175320908102435168984812                   100.00000
## Dim.41  0.0000000000000000000000000000020421998                   100.00000
## Dim.42  0.0000000000000000000000000000003601669                   100.00000
## Dim.43  0.0000000000000000000000000000003152227                   100.00000
## Dim.44  0.0000000000000000000000000000002751057                   100.00000
## Dim.45  0.0000000000000000000000000000002517916                   100.00000
fviz_eig(PCA)

#Since 80% of information covered by the fist 20 columns so I'll go for pcaComp 20! > Because I've put dummyies in the pca I get this result
PCa<-preProcess(x = train_diabetic_largedata[-46],method="pca", pcaComp = 20)
PCa_train_diabetic_largedata<-predict(PCa, train_diabetic_largedata)
PCa_test_diabetic_largedata<-predict(PCa, test_diabetic_largedata)
head(PCa_train_diabetic_largedata)
##      readmitted        PC1        PC2        PC3        PC4        PC5
## 163         YES -2.8746675  0.7110215  1.4531339  1.2768709  0.6116170
## 594          NO  1.0047112  0.9623812 -1.4979231 -1.4719208 -1.7497251
## 697          NO -0.2396595  0.7115938  2.0164012 -0.5031467  0.4535689
## 772         YES -2.8020098 -0.1759601 -0.5212236  0.3838711 -2.6209025
## 1281        YES  1.0467211  1.2756512 -0.4347498 -0.1801328 -1.1873455
## 1756        YES -2.0003096 -0.9507301 -1.1226754 -1.1025223 -1.2133094
##             PC6        PC7         PC8        PC9        PC10         PC11
## 163  -1.3751556  0.5647002  2.59165638  0.7118289 -1.69793155 -0.441464220
## 594  -0.9456402 -1.4400853 -0.32813288 -0.1395738  1.81199251  0.383967989
## 697  -1.3666440  0.6366796 -0.29735148  1.0305483  0.59569349  4.524702427
## 772  -2.7203187 -0.4189597  0.07812654  1.6150231 -0.02179667 -0.004352256
## 1281  0.2137869 -2.1545803 -0.22339069  1.3773906 -1.61826251  0.390024649
## 1756 -1.0791650 -2.3325741 -0.32572860  0.3063417 -0.82310510  2.929151963
##            PC12       PC13       PC14       PC15        PC16       PC17
## 163   1.5370893  1.7462433  0.3644117 -1.2999932 -0.39089910 -0.7931275
## 594   0.5768248  0.4296597  1.0715284 -1.4379110  0.07618682  1.1314236
## 697   0.6886590  1.0427219 -0.6063008  0.8489401 -0.40155507 -0.7536306
## 772  -1.3871150 -3.9246498  1.1085770  0.8720854  1.73595813  1.5415211
## 1281 -0.6938338 -0.4167629 -1.2436965  0.8513527 -1.21060991  0.6669665
## 1756 -1.1137953  0.0199972  0.9094216 -0.1513736 -0.89725268  1.0215179
##             PC18       PC19       PC20
## 163  -0.03895583  0.2828857 -0.2667563
## 594  -1.49641277 -0.3991312  1.5719328
## 697   1.94521252 -0.8059263 -2.8164233
## 772   1.29831147 -0.1996476 -1.4543981
## 1281 -0.83040544 -0.2947236 -0.4508713
## 1756  0.03379931 -1.0123100 -0.2092783
train_diabetic_largedata$readmitted <- as.factor(ifelse(train_diabetic_largedata$readmitted == "YES", "1", "0"))
test_diabetic_largedata$readmitted<-as.factor(ifelse(test_diabetic_largedata$readmitted=="YES","1","0"))
table(train_diabetic_largedata$readmitted)
## 
##   0   1 
##  93 138
table(test_diabetic_largedata$readmitted)
## 
##  0  1 
## 23 35
str(train_diabetic_largedata$readmitted)
##  Factor w/ 2 levels "0","1": 2 1 1 2 2 2 2 2 1 2 ...
str(test_diabetic_largedata$readmitted)
##  Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 1 1 ...
PCa_train_diabetic_largedata$readmitted <- as.factor(ifelse(PCa_train_diabetic_largedata$readmitted == "YES", "1", "0"))
PCa_test_diabetic_largedata$readmitted<-as.factor(ifelse(PCa_test_diabetic_largedata$readmitted=="YES","1","0"))
table(PCa_train_diabetic_largedata$readmitted)
## 
##   0   1 
##  93 138
table(PCa_test_diabetic_largedata$readmitted)
## 
##  0  1 
## 23 35
str(PCa_train_diabetic_largedata$readmitted)
##  Factor w/ 2 levels "0","1": 2 1 1 2 2 2 2 2 1 2 ...
str(PCa_test_diabetic_largedata$readmitted)
##  Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 1 1 ...

##K-means Clustering

# In this dataset we don't have technically any variables that may need unsupervised learning (like comment or unlabeled data) so I just practice the clustering on train data but I know that in clustering we don't need to split the test and train 
wcss = vector()
for (i in 1:10){
    model_kmeans = kmeans(train_diabetic_largedata, i)
    wcss[i] = sum(model_kmeans$withinss)
}

plot(1:10,
     wcss,
     type = 'b',
     main = paste('The Elbow Method'),
     xlab = 'Number of clusters',
     ylab = 'WCSS')

#based on elbow method two clusters make sence
library(cluster)
model_kmeans<- kmeans(train_diabetic_largedata, 3)
y_kmeans<- model_kmeans$cluster
kmeans_model_kmeans = kmeans(x = train_diabetic_largedata, centers = 3)
clusplot(train_diabetic_largedata,
         y_kmeans,
         lines = 0,
         shade = TRUE,
         color = TRUE,
         labels = 2,
         plotchar = FALSE,
         span = TRUE,
         main = 'Clusters of patients',
         xlab = 'Xlab',
         ylab = 'Ylab')

##Hierachical Clustering ON PCA data

hc <- hclust(d = dist(PCa_train_diabetic_largedata, method = 'euclidean'), method = 'ward.D')
plot(hc,
     main = 'Dendrogram',
     xlab = 'Customers',
     ylab = 'Euclidean distances')

#install.packages("factoextra")
library(factoextra)
fviz_nbclust(PCa_train_diabetic_largedata,  kmeans, method = "silhouette")+
  labs(subtitle = "Silhouette method")

#based on silhouette method 2 number of clusters is recommended
fviz_nbclust(PCa_train_diabetic_largedata[,-1], kmeans, nstart = 1,  method = "gap_stat", nboot = 50)+
  labs(subtitle = "Gap statistic method")

fviz_nbclust(PCa_train_diabetic_largedata[,-1], hcut, nstart = 1,  method = "gap_stat", nboot = 50)+
  labs(subtitle = "Gap statistic method")

#based on the graph optimal number of cluster are different from 2 3 and 8 ! but based on dendogram 3 make sence
#I'll go for 2 clusters
y_hc <-cutree(hc, 3)
library(cluster)
clusplot(train_diabetic_largedata,
         y_hc,
         lines = 0,
         shade = TRUE,
         color = TRUE,
         labels= 2,
         plotchar = FALSE,
         span = TRUE,
         main = 'Clusters of customers',
         xlab = 'Xlab',
         ylab = 'Ylab')

Suppervised learning

#MODELS :

# List of Models
model_list <-c()
accuracy_list<-c()
kappa_list<-c()

##Logestic regression :

LRmodel<-glm(readmitted~.,train_diabetic_largedata,family = "binomial")
summary(LRmodel)
## 
## Call:
## glm(formula = readmitted ~ ., family = "binomial", data = train_diabetic_largedata)
## 
## Coefficients: (5 not defined because of singularities)
##                             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                 0.685381   8.125013   0.084  0.93277   
## time_in_hospital            0.501433   0.233624   2.146  0.03185 * 
## num_lab_procedures         -0.608617   0.201751  -3.017  0.00256 **
## num_procedures             -0.139127   0.191683  -0.726  0.46795   
## num_medications             0.419535   0.252480   1.662  0.09658 . 
## number_inpatient            0.430593   0.249511   1.726  0.08439 . 
## number_diagnoses            0.057040   0.279653   0.204  0.83838   
## diag_circ                   0.080231   0.212306   0.378  0.70550   
## diag_resp                   0.194186   0.208264   0.932  0.35113   
## diag_dig                    0.113123   0.188087   0.601  0.54755   
## diag_diab                  -0.344623   0.201676  -1.709  0.08749 . 
## diag_inj                    0.044168   0.198233   0.223  0.82368   
## diag_geni                  -0.036827   0.173402  -0.212  0.83181   
## diag_other                 -0.079484   0.206787  -0.384  0.70070   
## raceAfricanAmerican        -0.527218   0.288128  -1.830  0.06728 . 
## raceCaucasian              -0.191791   0.338888  -0.566  0.57143   
## raceHispanic               -0.023366   0.281366  -0.083  0.93382   
## genderFemale                0.190843   0.186570   1.023  0.30635   
## genderMale                        NA         NA      NA       NA   
## agethird                   -0.628737   0.283992  -2.214  0.02683 * 
## ageforth                   -0.574444   0.384938  -1.492  0.13562   
## agefifth                   -0.812470   0.456424  -1.780  0.07506 . 
## agesixth                   -0.734031   0.465585  -1.577  0.11489   
## ageseventh                 -0.806224   0.461348  -1.748  0.08054 . 
## ageeighth                  -0.786368   0.426354  -1.844  0.06512 . 
## discharge_disposition_id1   0.237655   0.301199   0.789  0.43009   
## discharge_disposition_id2   0.410209   0.227801   1.801  0.07174 . 
## discharge_disposition_id3   0.388435   0.264536   1.468  0.14200   
## discharge_disposition_id6  -0.159235   0.246322  -0.646  0.51799   
## admission_type_id1          0.884149   0.559535   1.580  0.11407   
## admission_type_id6          0.557252   0.621723   0.896  0.37009   
## max_glu_serum200            0.055992   0.193346   0.290  0.77213   
## max_glu_serum300            0.074233   0.258567   0.287  0.77404   
## max_glu_serumNorm                 NA         NA      NA       NA   
## A1Cresult7                 -0.017327   0.232840  -0.074  0.94068   
## A1Cresult8                  0.006712   0.262899   0.026  0.97963   
## A1CresultNorm                     NA         NA      NA       NA   
## admission_source_id1       -4.142956 270.311284  -0.015  0.98777   
## admission_source_id7       -4.713139 281.540180  -0.017  0.98664   
## insulinDown                 0.052218   0.245427   0.213  0.83151   
## insulinNo                   0.104542   0.417200   0.251  0.80214   
## insulinSteady               0.195932   0.368150   0.532  0.59458   
## changeCh                    0.115412   0.265828   0.434  0.66417   
## changeNo                          NA         NA      NA       NA   
## diabetesMedNo               0.021794   0.247601   0.088  0.92986   
## diabetesMedYes                    NA         NA      NA       NA   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 311.41  on 230  degrees of freedom
## Residual deviance: 239.74  on 190  degrees of freedom
## AIC: 321.74
## 
## Number of Fisher Scoring iterations: 14
#Based un this model if we keep the variable's change, the y starts at 0.6 and for 1 increase in Time in hospital and number of lab procedures the probabality of outcome increase o.5 times and decrease 0.6 times respectively also 1 increase in age(third) decrease the outcome probabality by 60%
predictLRmodel<-predict(LRmodel,test_diabetic_largedata)
head(predictLRmodel)
##        461        824        962       1984       2309       5016 
## -0.8503202 -2.5802732 -1.4662611  0.8109774  0.3940459 -2.9377491
class_predictLRmodel<-ifelse(predictLRmodel>0.5,1,0)
plot(LRmodel)

str(class_predictLRmodel)
##  Named num [1:58] 0 0 0 1 0 0 1 1 1 0 ...
##  - attr(*, "names")= chr [1:58] "461" "824" "962" "1984" ...
str(test_diabetic_largedata$readmitted)
##  Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 1 1 ...
class(test_diabetic_largedata$readmitted)
## [1] "factor"
class(class_predictLRmodel)
## [1] "numeric"
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(caret)
LRmodelconfusionmatric<-confusionMatrix(test_diabetic_largedata$readmitted,factor(class_predictLRmodel))
LRmodelconfusionmatric
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 15  8
##          1 14 21
##                                           
##                Accuracy : 0.6207          
##                  95% CI : (0.4837, 0.7449)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.04347         
##                                           
##                   Kappa : 0.2414          
##                                           
##  Mcnemar's Test P-Value : 0.28642         
##                                           
##             Sensitivity : 0.5172          
##             Specificity : 0.7241          
##          Pos Pred Value : 0.6522          
##          Neg Pred Value : 0.6000          
##              Prevalence : 0.5000          
##          Detection Rate : 0.2586          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.6207          
##                                           
##        'Positive' Class : 0               
## 
LRmodelroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),predictLRmodel,plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

LRmodelroc
## 
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted,     ordered = T), predictor = predictLRmodel, plot = T, print.auc = TRUE)
## 
## Data: predictLRmodel in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.7043
model_list <-append(model_list,"LRmodel")
accuracy_list<-append(accuracy_list,LRmodelconfusionmatric$overall['Accuracy'])
kappa_list<-append(kappa_list,LRmodelconfusionmatric$overall['Kappa'])
#Logistic regression with just significant variables :
LRmodel2<-glm(readmitted~time_in_hospital+num_lab_procedures+agethird,train_diabetic_largedata,family = binomial)
summary(LRmodel2)
## 
## Call:
## glm(formula = readmitted ~ time_in_hospital + num_lab_procedures + 
##     agethird, family = binomial, data = train_diabetic_largedata)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          0.4262     0.1402   3.039 0.002375 ** 
## time_in_hospital     0.5210     0.1583   3.291 0.000997 ***
## num_lab_procedures  -0.3607     0.1504  -2.399 0.016451 *  
## agethird            -0.2365     0.1445  -1.637 0.101661    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 311.41  on 230  degrees of freedom
## Residual deviance: 294.66  on 227  degrees of freedom
## AIC: 302.66
## 
## Number of Fisher Scoring iterations: 4
predictLRmodel2<-predict(LRmodel2,test_diabetic_largedata)
head(predictLRmodel2)
##        461        824        962       1984       2309       5016 
##  1.0624947 -0.2841240 -0.1532191  0.8203332  0.4546378 -0.3732674
class_predictLRdimodel2<-ifelse(predictLRmodel2>0.5,1,0)

LRmodelconfusionmatric2<-confusionMatrix(test_diabetic_largedata$readmitted,factor(class_predictLRdimodel2))
LRmodelconfusionmatric2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 19  4
##          1 16 19
##                                           
##                Accuracy : 0.6552          
##                  95% CI : (0.5188, 0.7751)
##     No Information Rate : 0.6034          
##     P-Value [Acc > NIR] : 0.25292         
##                                           
##                   Kappa : 0.3387          
##                                           
##  Mcnemar's Test P-Value : 0.01391         
##                                           
##             Sensitivity : 0.5429          
##             Specificity : 0.8261          
##          Pos Pred Value : 0.8261          
##          Neg Pred Value : 0.5429          
##              Prevalence : 0.6034          
##          Detection Rate : 0.3276          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.6845          
##                                           
##        'Positive' Class : 0               
## 
LRmodel2roc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),predictLRmodel2,plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

LRmodel2roc 
## 
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted,     ordered = T), predictor = predictLRmodel2, plot = T, print.auc = TRUE)
## 
## Data: predictLRmodel2 in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.718
model_list <-append(model_list,"LRmodel2")
accuracy_list<-append(accuracy_list,LRmodelconfusionmatric2$overall['Accuracy'])
kappa_list<-append(kappa_list,LRmodelconfusionmatric2$overall['Kappa'])
#logestic regretion with the PCA data set
PCALRmodel3<-glm(readmitted~.,PCa_train_diabetic_largedata,family = "binomial")
summary(PCALRmodel3)
## 
## Call:
## glm(formula = readmitted ~ ., family = "binomial", data = PCa_train_diabetic_largedata)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.52383    0.15565   3.365 0.000764 ***
## PC1          0.12015    0.07021   1.711 0.087032 .  
## PC2         -0.15412    0.08192  -1.881 0.059928 .  
## PC3          0.31894    0.08963   3.558 0.000373 ***
## PC4         -0.08743    0.09650  -0.906 0.364932    
## PC5         -0.03482    0.09824  -0.354 0.723006    
## PC6          0.33425    0.11115   3.007 0.002635 ** 
## PC7          0.07124    0.11015   0.647 0.517781    
## PC8         -0.02535    0.11240  -0.226 0.821588    
## PC9          0.03506    0.11205   0.313 0.754345    
## PC10        -0.29494    0.12758  -2.312 0.020790 *  
## PC11        -0.10447    0.12222  -0.855 0.392675    
## PC12         0.11079    0.13021   0.851 0.394822    
## PC13        -0.30083    0.13580  -2.215 0.026748 *  
## PC14         0.21961    0.14505   1.514 0.130018    
## PC15         0.29881    0.14221   2.101 0.035623 *  
## PC16        -0.04865    0.14162  -0.344 0.731203    
## PC17         0.04412    0.13806   0.320 0.749271    
## PC18        -0.02303    0.14577  -0.158 0.874464    
## PC19        -0.04806    0.14882  -0.323 0.746746    
## PC20         0.27985    0.16594   1.686 0.091712 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 311.41  on 230  degrees of freedom
## Residual deviance: 265.02  on 210  degrees of freedom
## AIC: 307.02
## 
## Number of Fisher Scoring iterations: 4
predictLRmodel3<-predict(PCALRmodel3,PCa_test_diabetic_largedata)
head(predictLRmodel3)
##         461         824         962        1984        2309        5016 
## -0.05863418 -0.71263723 -1.19628013  0.51373357 -0.29253418 -1.36470207
class_predictLRmodel3<-ifelse(predictLRmodel3>0.5,1,0)
head(class_predictLRmodel3)
##  461  824  962 1984 2309 5016 
##    0    0    0    1    0    0
table(PCa_test_diabetic_largedata$readmitted)
## 
##  0  1 
## 23 35
PCALRmodelconfusionmatric<-confusionMatrix(factor(PCa_test_diabetic_largedata$readmitted,ordered=T),factor(class_predictLRmodel3,ordered=T))
PCALRmodelconfusionmatric
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 16  7
##          1 15 20
##                                           
##                Accuracy : 0.6207          
##                  95% CI : (0.4837, 0.7449)
##     No Information Rate : 0.5345          
##     P-Value [Acc > NIR] : 0.1177          
##                                           
##                   Kappa : 0.2521          
##                                           
##  Mcnemar's Test P-Value : 0.1356          
##                                           
##             Sensitivity : 0.5161          
##             Specificity : 0.7407          
##          Pos Pred Value : 0.6957          
##          Neg Pred Value : 0.5714          
##              Prevalence : 0.5345          
##          Detection Rate : 0.2759          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.6284          
##                                           
##        'Positive' Class : 0               
## 
LRmodel3roc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),predictLRmodel3,plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

LRmodel3roc 
## 
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted,     ordered = T), predictor = predictLRmodel3, plot = T, print.auc = TRUE)
## 
## Data: predictLRmodel3 in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.6522
model_list <-append(model_list,"PCALRmodel3")
accuracy_list<-append(accuracy_list,PCALRmodelconfusionmatric$overall['Accuracy'])
kappa_list<-append(kappa_list,PCALRmodelconfusionmatric$overall['Kappa'])

##Elastic Net

library(caret)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
ELmodel<-cv.glmnet(as.matrix(train_diabetic_largedata[,-46]), train_diabetic_largedata$readmitted, family = "binomial", alpha = 0.5)
ELpredict<-predict(ELmodel,as.matrix(test_diabetic_largedata[,-46]),type="response")
ELpredict_class<-ifelse(ELpredict>0.5,1,0)
mean(ELpredict_class==test_diabetic_largedata$readmitted)
## [1] 0.6034483
ELmodelroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),ELpredict,plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

PCaELmodel<-cv.glmnet(as.matrix(PCa_train_diabetic_largedata[,-1]), PCa_train_diabetic_largedata$readmitted, family = "binomial", alpha = 0.5)
plot(PCaELmodel)

PCaELpredict<-predict(PCaELmodel,as.matrix(PCa_test_diabetic_largedata[,-1]))
PCaELpredict_class<-ifelse(PCaELpredict>0.5,1,0)
table(PCa_test_diabetic_largedata$readmitted,PCaELpredict_class)
##    PCaELpredict_class
##      0  1
##   0 19  4
##   1 23 12
mean(PCaELpredict_class==PCa_test_diabetic_largedata$readmitted)
## [1] 0.5344828
roc(PCa_test_diabetic_largedata$readmitted,PCaELpredict,plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = PCa_test_diabetic_largedata$readmitted,     predictor = PCaELpredict, plot = T, print.auc = TRUE)
## 
## Data: PCaELpredict in 23 controls (PCa_test_diabetic_largedata$readmitted 0) < 35 cases (PCa_test_diabetic_largedata$readmitted 1).
## Area under the curve: 0.6522

##KNN:

library(class)
vec = c()
k_vec = c()
for (k in 1:50){
predictKNN= knn(train = train_diabetic_largedata[, -46],test = test_diabetic_largedata[, -46],cl = train_diabetic_largedata$readmitted,k = k)

error = mean(predictKNN != test_diabetic_largedata$readmitted)
k_vec = c(k_vec, k)
vec = c(vec, error)}
dataframeerror<-data.frame(k_vec,vec)
min_row <- subset(dataframeerror, vec == min(vec))
ggplot(dataframeerror,aes(x=k_vec,y=vec))+geom_line(color="red")+
  geom_hline(yintercept = min(dataframeerror$vec), linetype = "dashed") +annotate("text", x = min_row$k_vec, y = min_row$vec, label = min_row$k_vec, vjust = -1)+geom_point(data = min_row, aes(x = k_vec, y = vec), color = "blue", size = 3)

#k=17
predictKNN<-knn(train = train_diabetic_largedata[, -46],test = test_diabetic_largedata[, -46],cl = train_diabetic_largedata$readmitted,k = 17)
head(predictKNN)
## [1] 1 1 0 1 1 1
## Levels: 0 1
knnconfusionmatrix<-confusionMatrix(factor((test_diabetic_largedata$readmitted),ordered=T),factor(predictKNN,ordered=T))
knnconfusionmatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 17  6
##          1  7 28
##                                           
##                Accuracy : 0.7759          
##                  95% CI : (0.6473, 0.8749)
##     No Information Rate : 0.5862          
##     P-Value [Acc > NIR] : 0.001934        
##                                           
##                   Kappa : 0.5351          
##                                           
##  Mcnemar's Test P-Value : 1.000000        
##                                           
##             Sensitivity : 0.7083          
##             Specificity : 0.8235          
##          Pos Pred Value : 0.7391          
##          Neg Pred Value : 0.8000          
##              Prevalence : 0.4138          
##          Detection Rate : 0.2931          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.7659          
##                                           
##        'Positive' Class : 0               
## 
predictKNN.roc<-roc(test_diabetic_largedata$readmitted,factor(predictKNN,ordered=T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

predictKNN.roc
## 
## Call:
## roc.default(response = test_diabetic_largedata$readmitted, predictor = factor(predictKNN,     ordered = T), plot = T, print.auc = TRUE)
## 
## Data: factor(predictKNN, ordered = T) in 23 controls (test_diabetic_largedata$readmitted 0) < 35 cases (test_diabetic_largedata$readmitted 1).
## Area under the curve: 0.7696
model_list <-append(model_list,"predictKNN")
accuracy_list<-append(accuracy_list,knnconfusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,knnconfusionmatrix$overall['Kappa'])
#knn with PCA data
vec2 = c()
k_vec2 = c()
for (k in 1:50){
predictKNN= knn(train = PCa_train_diabetic_largedata[, -1],test = PCa_test_diabetic_largedata[, -1],cl = PCa_train_diabetic_largedata$readmitted,k = k)

error2 = mean(predictKNN != PCa_test_diabetic_largedata$readmitted)
k_vec2 = c(k_vec2, k)
vec2 = c(vec2, error2)}
dataframeerror2<-data.frame(k_vec2,vec2)
min_row2 <- subset(dataframeerror2, vec2 == min(vec2))
ggplot(dataframeerror2,aes(x=k_vec2,y=vec2))+geom_line(color="red")+
  geom_hline(yintercept = min(dataframeerror2$vec2), linetype = "dashed") +annotate("text", x = min_row2$k_vec2, y = min_row2$vec2, label = min_row2$k_vec2, vjust = -1)+geom_point(data = min_row2, aes(x = k_vec2, y = vec2), color = "blue", size = 3)

#k=24
PCApredictKNN<-knn(train = PCa_train_diabetic_largedata[, -1],test = PCa_test_diabetic_largedata[, -1],cl = PCa_train_diabetic_largedata$readmitted,k = 33)
head(PCApredictKNN)
## [1] 1 1 1 1 0 0
## Levels: 0 1
PCaKNNconfisionmatrix<-confusionMatrix(factor((PCa_test_diabetic_largedata$readmitted),ordered=T),factor(PCApredictKNN,ordered=T))
PCaKNNconfisionmatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 14  9
##          1  6 29
##                                           
##                Accuracy : 0.7414          
##                  95% CI : (0.6096, 0.8474)
##     No Information Rate : 0.6552          
##     P-Value [Acc > NIR] : 0.1052          
##                                           
##                   Kappa : 0.4473          
##                                           
##  Mcnemar's Test P-Value : 0.6056          
##                                           
##             Sensitivity : 0.7000          
##             Specificity : 0.7632          
##          Pos Pred Value : 0.6087          
##          Neg Pred Value : 0.8286          
##              Prevalence : 0.3448          
##          Detection Rate : 0.2414          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.7316          
##                                           
##        'Positive' Class : 0               
## 
PCApredictKNN.roc<-roc(PCa_test_diabetic_largedata$readmitted,factor(PCApredictKNN,ordered=T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

PCApredictKNN.roc 
## 
## Call:
## roc.default(response = PCa_test_diabetic_largedata$readmitted,     predictor = factor(PCApredictKNN, ordered = T), plot = T,     print.auc = TRUE)
## 
## Data: factor(PCApredictKNN, ordered = T) in 23 controls (PCa_test_diabetic_largedata$readmitted 0) < 35 cases (PCa_test_diabetic_largedata$readmitted 1).
## Area under the curve: 0.7186
model_list <-append(model_list,"PCApredictKNN")
accuracy_list<-append(accuracy_list,PCaKNNconfisionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,PCaKNNconfisionmatrix$overall['Kappa'])

##SVM model :

#svm radial
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:lessR':
## 
##     kurtosis
tune.svm.largediabet <- tune(svm,train.x=train_diabetic_largedata[, -46],train.y=train_diabetic_largedata[, 46],kernel='radial',ranges=list(cost=10^(-1:2), gamma=c(0.25,.5,1,2)))
tune.svm.largediabet
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##   0.1  0.25
## 
## - best performance: 0.4030797
SVMmodel1<-svm(formula = readmitted ~ .,data = train_diabetic_largedata,kernel = 'radial',type="C-classification",cost=0.1,gamma=0.25)
predictSVMmodel1<-predict(SVMmodel1,test_diabetic_largedata)

SVMmodel1.roc<-roc(test_diabetic_largedata$readmitted,factor(predictSVMmodel1,ordered=T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

SVMmodel1.roc 
## 
## Call:
## roc.default(response = test_diabetic_largedata$readmitted, predictor = factor(predictSVMmodel1,     ordered = T), plot = T, print.auc = TRUE)
## 
## Data: factor(predictSVMmodel1, ordered = T) in 23 controls (test_diabetic_largedata$readmitted 0) < 35 cases (test_diabetic_largedata$readmitted 1).
## Area under the curve: 0.5
table(test_diabetic_largedata$readmitted,predictSVMmodel1)
##    predictSVMmodel1
##      0  1
##   0  0 23
##   1  0 35
#all the predictions are 1 seems the SVM model overfitted the train ! I'll try cross validation later 
#SVM for PCA data
tune.svm.largediabet2 <- tune(svm,train.x=PCa_train_diabetic_largedata[, -1],train.y=PCa_train_diabetic_largedata[, 1],kernel='radial',ranges=list(cost=10^(-1:2), gamma=c(0.25,.5,1,2)))
tune.svm.largediabet2
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1  0.25
## 
## - best performance: 0.3856884
PCASVMmodel2<-svm(formula = readmitted ~ .,data = PCa_train_diabetic_largedata,kernel = 'radial',type="C-classification",cost=10,gamma=0.5)
predictSVMmodel2<-predict(PCASVMmodel2,PCa_test_diabetic_largedata)
table(factor(PCa_test_diabetic_largedata$readmitted),factor(predictSVMmodel2,ordered=T))
##    
##      1
##   0 23
##   1 35
SVMmodel2.roc<-roc(PCa_test_diabetic_largedata$readmitted,factor(predictSVMmodel2,ordered=T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

SVMmodel2.roc
## 
## Call:
## roc.default(response = PCa_test_diabetic_largedata$readmitted,     predictor = factor(predictSVMmodel2, ordered = T), plot = T,     print.auc = TRUE)
## 
## Data: factor(predictSVMmodel2, ordered = T) in 23 controls (PCa_test_diabetic_largedata$readmitted 0) < 35 cases (PCa_test_diabetic_largedata$readmitted 1).
## Area under the curve: 0.5

my SVM is working as good as flip a coin !!!!!

##Decession Tree

library(rpart)
DTmodel1<-rpart(readmitted ~ ., method='class',data = train_diabetic_largedata)
plot(DTmodel1, uniform=TRUE, main="Main tree")
text(DTmodel1, use.n=TRUE, all=TRUE)

DTpredict1<-predict(DTmodel1,test_diabetic_largedata[-46])
class_DTpredict1<-ifelse(DTpredict1[,"1"]>0.5,1,0)
DTconfusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered = T),factor(class_DTpredict1,ordered = T))
DTconfusionmatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0  7 16
##          1  6 29
##                                           
##                Accuracy : 0.6207          
##                  95% CI : (0.4837, 0.7449)
##     No Information Rate : 0.7759          
##     P-Value [Acc > NIR] : 0.99764         
##                                           
##                   Kappa : 0.1436          
##                                           
##  Mcnemar's Test P-Value : 0.05501         
##                                           
##             Sensitivity : 0.5385          
##             Specificity : 0.6444          
##          Pos Pred Value : 0.3043          
##          Neg Pred Value : 0.8286          
##              Prevalence : 0.2241          
##          Detection Rate : 0.1207          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.5915          
##                                           
##        'Positive' Class : 0               
## 
DTpredict1.roc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(class_DTpredict1,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

DTpredict1.roc
## 
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted,     ordered = T), predictor = factor(class_DTpredict1, ordered = T),     plot = T, print.auc = TRUE)
## 
## Data: factor(class_DTpredict1, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.5665
model_list <-append(model_list,"DTmodel1")
accuracy_list<-append(accuracy_list,DTconfusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,DTconfusionmatrix$overall['Kappa'])
#Decision Tree in PCA data
DTmodel2<-rpart(readmitted ~ ., method='class',data = PCa_train_diabetic_largedata)
plot(DTmodel2, uniform=TRUE, main="Main tree")
text(DTmodel2, use.n=TRUE, all=TRUE)

DTpredict2<-predict(DTmodel2,PCa_test_diabetic_largedata[-1])
class_DTpredict2<-ifelse(DTpredict2[,"1"]>0.5,1,0)
confusionMatrix(factor(PCa_test_diabetic_largedata$readmitted,ordered = T),factor(class_DTpredict2,ordered = T))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0  9 14
##          1 12 23
##                                           
##                Accuracy : 0.5517          
##                  95% CI : (0.4154, 0.6826)
##     No Information Rate : 0.6379          
##     P-Value [Acc > NIR] : 0.9319          
##                                           
##                   Kappa : 0.0492          
##                                           
##  Mcnemar's Test P-Value : 0.8445          
##                                           
##             Sensitivity : 0.4286          
##             Specificity : 0.6216          
##          Pos Pred Value : 0.3913          
##          Neg Pred Value : 0.6571          
##              Prevalence : 0.3621          
##          Detection Rate : 0.1552          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.5251          
##                                           
##        'Positive' Class : 0               
## 
DTpredict2.roc<-roc(factor(PCa_test_diabetic_largedata$readmitted,ordered=T),factor(class_DTpredict2,ordered = T),plot=T,print.auc=T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

DTpredict2.roc
## 
## Call:
## roc.default(response = factor(PCa_test_diabetic_largedata$readmitted,     ordered = T), predictor = factor(class_DTpredict2, ordered = T),     plot = T, print.auc = T)
## 
## Data: factor(class_DTpredict2, ordered = T) in 23 controls (factor(PCa_test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(PCa_test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.5242

##Random Forest

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
RFmodel1<-randomForest(readmitted ~ ., method='class',data =train_diabetic_largedata)
importance(RFmodel1)
##                           MeanDecreaseGini
## time_in_hospital                 8.1818596
## num_lab_procedures              11.6281479
## num_procedures                   3.4827573
## num_medications                 13.0492024
## number_inpatient                 5.9557078
## number_diagnoses                 3.7367889
## diag_circ                        2.0734962
## diag_resp                        2.2239198
## diag_dig                         1.0200692
## diag_diab                        2.3612038
## diag_inj                         1.2613006
## diag_geni                        1.3350263
## diag_other                       2.4821827
## raceAfricanAmerican              2.1710502
## raceCaucasian                    2.2610999
## raceHispanic                     1.1082423
## genderFemale                     2.3348926
## genderMale                       2.6737351
## agethird                         1.2403264
## ageforth                         1.0953051
## agefifth                         1.5149810
## agesixth                         1.4820269
## ageseventh                       1.6191520
## ageeighth                        1.4251049
## discharge_disposition_id1        2.1203363
## discharge_disposition_id2        1.0409082
## discharge_disposition_id3        1.5798936
## discharge_disposition_id6        1.6153082
## admission_type_id1               1.2399576
## admission_type_id6               1.2141682
## max_glu_serum200                 1.7696785
## max_glu_serum300                 1.3349874
## max_glu_serumNorm                1.6756513
## A1Cresult7                       1.4890854
## A1Cresult8                       1.7376547
## A1CresultNorm                    1.7568771
## admission_source_id1             1.0217697
## admission_source_id7             1.1240186
## insulinDown                      0.7181179
## insulinNo                        1.4234548
## insulinSteady                    1.2803110
## changeCh                         1.3610448
## changeNo                         1.5414399
## diabetesMedNo                    1.7117182
## diabetesMedYes                   1.6760362
#based on this model number of medications has the most impact on the readmitted or not readmitted patient, after that time in hospital and number of lab_procedures had impact on readmitted or not readmitted patients
RFpredict1<-predict(RFmodel1,test_diabetic_largedata)
#class_RFpredict1<-ifelse(RFpredict1>0.5,1,0)
RFconfusionmatrix<-confusionMatrix(test_diabetic_largedata$readmitted,factor(RFpredict1,ordered = T))
RFconfusionmatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 14  9
##          1  7 28
##                                          
##                Accuracy : 0.7241         
##                  95% CI : (0.591, 0.8334)
##     No Information Rate : 0.6379         
##     P-Value [Acc > NIR] : 0.1080         
##                                          
##                   Kappa : 0.4149         
##                                          
##  Mcnemar's Test P-Value : 0.8026         
##                                          
##             Sensitivity : 0.6667         
##             Specificity : 0.7568         
##          Pos Pred Value : 0.6087         
##          Neg Pred Value : 0.8000         
##              Prevalence : 0.3621         
##          Detection Rate : 0.2414         
##    Detection Prevalence : 0.3966         
##       Balanced Accuracy : 0.7117         
##                                          
##        'Positive' Class : 0              
## 
RFpredict1.roc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(RFpredict1,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

RFpredict1.roc
## 
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted,     ordered = T), predictor = factor(RFpredict1, ordered = T),     plot = T, print.auc = TRUE)
## 
## Data: factor(RFpredict1, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.7043
model_list <-append(model_list,"RFmodel1")
accuracy_list<-append(accuracy_list,RFconfusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,RFconfusionmatrix$overall['Kappa'])
#Rndom Forest with PCA
PCARFmodel2<-randomForest(readmitted ~ ., method='class',data =PCa_train_diabetic_largedata)
importance(PCARFmodel2)
##      MeanDecreaseGini
## PC1          5.754521
## PC2          5.820856
## PC3         11.285097
## PC4          4.868472
## PC5          4.822118
## PC6          6.354644
## PC7          5.300672
## PC8          4.262475
## PC9          4.588260
## PC10         6.441359
## PC11         5.251960
## PC12         4.174113
## PC13         6.039580
## PC14         4.834762
## PC15         5.107902
## PC16         4.812358
## PC17         5.614285
## PC18         4.525770
## PC19         5.043179
## PC20         5.605877
PCaRFpredict2<-predict(PCARFmodel2,PCa_test_diabetic_largedata)
#PCaRFconfusionmatrix<-confusionMatrix(factor(PCa_test_diabetic_largedata$readmitted,ordered=T),factor(PCaRFpredict2,ordered = T))
#PCaRFconfusionmatrix
RFpredict2.roc<-roc(factor(PCa_test_diabetic_largedata$readmitted,ordered=T),factor(PCaRFpredict2,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

RFpredict2.roc
## 
## Call:
## roc.default(response = factor(PCa_test_diabetic_largedata$readmitted,     ordered = T), predictor = factor(PCaRFpredict2, ordered = T),     plot = T, print.auc = TRUE)
## 
## Data: factor(PCaRFpredict2, ordered = T) in 23 controls (factor(PCa_test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(PCa_test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.7329
#model_list <-append(model_list,"PCARFDTmodel2")
#accuracy_list<-append(accuracy_list,PCaRFconfusionmatrix$overall['Accuracy'])
#kappa_list<-append(kappa_list,PCaRFconfusionmatrix$overall['Kappa'])
head(test_diabetic_largedata)
##      time_in_hospital num_lab_procedures num_procedures num_medications
## 461         1.4896652          0.5474001      0.1168388       0.5768653
## 824         0.5055489          2.8592177      1.7044716       0.1766975
## 962        -1.1346449          0.1270696     -0.6769776      -1.5573632
## 1984       -0.4785674         -1.6243074      0.1168388      -0.8904168
## 2309        0.1775101          0.3372348     -0.6769776       0.3100868
## 5016       -0.1505286          2.1586669      0.9106552      -0.4902489
##      number_inpatient number_diagnoses  diag_circ  diag_resp   diag_dig
## 461        -0.5275243       -0.6526203 -1.0835776 -0.6647022 -0.3705859
## 824        -0.5275243       -0.6526203  0.9188738 -0.6647022 -0.3705859
## 962        -0.5275243       -1.9867378  0.9188738 -0.6647022 -0.3705859
## 1984       -0.5275243       -1.9867378 -1.0835776  1.4979204 -0.3705859
## 2309        0.2637622       -0.6526203  0.9188738 -0.6647022 -0.3705859
## 5016       -0.5275243       -0.6526203  0.9188738 -0.6647022 -0.3705859
##       diag_diab  diag_inj  diag_geni diag_other raceAfricanAmerican
## 461   0.7684633 -0.281239 -0.4357706  0.9029688           2.4441821
## 824   0.7684633 -0.281239 -0.4357706  0.9029688          -0.4073637
## 962   0.7684633 -0.281239 -0.4357706  0.9029688          -0.4073637
## 1984  0.7684633 -0.281239 -0.4357706 -1.1026638          -0.4073637
## 2309 -1.2956649 -0.281239 -0.4357706 -1.1026638          -0.4073637
## 5016  0.7684633 -0.281239  2.2848512 -1.1026638           2.4441821
##      raceCaucasian raceHispanic genderFemale genderMale   agethird   ageforth
## 461     -1.3975146   -0.3928473    0.8565351 -0.8565351 -0.2436696 -0.4001349
## 824      0.7124584   -0.3928473   -1.1624404  1.1624404 -0.2436696 -0.4001349
## 962     -1.3975146    2.5344987    0.8565351 -0.8565351 -0.2436696 -0.4001349
## 1984     0.7124584   -0.3928473   -1.1624404  1.1624404 -0.2436696 -0.4001349
## 2309    -1.3975146    2.5344987    0.8565351 -0.8565351 -0.2436696 -0.4001349
## 5016    -1.3975146   -0.3928473    0.8565351 -0.8565351 -0.2436696 -0.4001349
##        agefifth   agesixth ageseventh  ageeighth discharge_disposition_id1
## 461  -0.5043104 -0.5043104  1.8985070 -0.4216615                 0.7402386
## 824  -0.5043104 -0.5043104 -0.5244495  2.3613042                 0.7402386
## 962   1.9743217 -0.5043104 -0.5244495 -0.4216615                 0.7402386
## 1984  1.9743217 -0.5043104 -0.5244495 -0.4216615                -1.3450677
## 2309 -0.5043104 -0.5043104  1.8985070 -0.4216615                -1.3450677
## 5016  1.9743217 -0.5043104 -0.5244495 -0.4216615                 0.7402386
##      discharge_disposition_id2 discharge_disposition_id3
## 461                 -0.2629521                -0.3630151
## 824                 -0.2629521                -0.3630151
## 962                 -0.2629521                -0.3630151
## 1984                -0.2629521                -0.3630151
## 2309                -0.2629521                -0.3630151
## 5016                -0.2629521                -0.3630151
##      discharge_disposition_id6 admission_type_id1 admission_type_id6
## 461                 -0.3397648         -0.4703831          0.5110378
## 824                 -0.3397648         -0.4703831          0.5110378
## 962                 -0.3397648         -0.4703831          0.5110378
## 1984                -0.3397648         -0.4703831          0.5110378
## 2309                 2.9304714         -0.4703831          0.5110378
## 5016                -0.3397648         -0.4703831          0.5110378
##      max_glu_serum200 max_glu_serum300 max_glu_serumNorm A1Cresult7 A1Cresult8
## 461        -0.5644597        1.1832846        -0.7193656 -0.5511479  0.8414468
## 824        -0.5644597        1.1832846        -0.7193656  1.8065404 -1.1832846
## 962        -0.5644597       -0.8414468         1.3840959  1.8065404 -1.1832846
## 1984        1.7639365       -0.8414468        -0.7193656 -0.5511479  0.8414468
## 2309        1.7639365       -0.8414468        -0.7193656 -0.5511479 -1.1832846
## 5016       -0.5644597       -0.8414468         1.3840959 -0.5511479  0.8414468
##      A1CresultNorm admission_source_id1 admission_source_id7 insulinDown
## 461     -0.4703831           -0.3155425            0.3318105  -0.2534499
## 824     -0.4703831           -0.3155425            0.3318105  -0.2534499
## 962     -0.4703831           -0.3155425            0.3318105  -0.2534499
## 1984    -0.4703831            3.1554255           -3.0007214  -0.2534499
## 2309     2.1167238           -0.3155425            0.3318105  -0.2534499
## 5016    -0.4703831           -0.3155425            0.3318105  -0.2534499
##       insulinNo insulinSteady   changeCh   changeNo diabetesMedNo
## 461  -1.5946130    -0.4497173  1.6118558 -1.6118558    -0.8641489
## 824   0.6243966    -0.4497173 -0.6177172  0.6177172    -0.8641489
## 962   0.6243966    -0.4497173 -0.6177172  0.6177172    -0.8641489
## 1984  0.6243966    -0.4497173 -0.6177172  0.6177172    -0.8641489
## 2309  0.6243966    -0.4497173 -0.6177172  0.6177172     1.1521985
## 5016 -1.5946130     2.2139930 -0.6177172  0.6177172    -0.8641489
##      diabetesMedYes readmitted
## 461       0.8641489          1
## 824       0.8641489          1
## 962       0.8641489          1
## 1984      0.8641489          1
## 2309     -1.1521985          1
## 5016      0.8641489          1
test_diabetic_largedata$readmitted<-as.numeric(test_diabetic_largedata$readmitted)
train_diabetic_largedata$readmitted<-as.numeric(train_diabetic_largedata$readmitted)
train_diabetic_largedata$readmitted<-ifelse(train_diabetic_largedata$readmitted==1,0,1)
test_diabetic_largedata$readmitted<-ifelse(test_diabetic_largedata$readmitted==1,0,1)
str(test_diabetic_largedata$readmitted)
##  num [1:58] 1 1 1 1 1 1 1 1 0 0 ...

##XGboost

library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
XGboostclassifier <- xgboost(data = as.matrix(train_diabetic_largedata[-46]), 
                              label = train_diabetic_largedata$readmitted, 
                             nrounds = 10, 
                              max_depth = 6, 
                              eta = 0.3, 
                              gamma = 0.5, 
                              subsample = 0.8, 
                              colsample_bytree = 0.8, 
                              min_child_weight = 1)
## [1]  train-rmse:0.443372 
## [2]  train-rmse:0.400061 
## [3]  train-rmse:0.364136 
## [4]  train-rmse:0.333161 
## [5]  train-rmse:0.316839 
## [6]  train-rmse:0.302918 
## [7]  train-rmse:0.293847 
## [8]  train-rmse:0.293799 
## [9]  train-rmse:0.293774 
## [10] train-rmse:0.287219
XGboost_predict <- predict(XGboostclassifier, newdata = as.matrix(test_diabetic_largedata[-46]))
head(XGboost_predict)
## [1] 0.4178621 0.2111195 0.1282913 0.7103968 0.7638208 0.1316332
XGboost_predict<-ifelse(XGboost_predict>=0.5,1,0)
table(test_diabetic_largedata$readmitted,XGboost_predict)
##    XGboost_predict
##      0  1
##   0  9 14
##   1 13 22
imp_matrix_XGboos<-xgb.importance(model=XGboostclassifier)
xgb.plot.importance(imp_matrix_XGboos)

xgboostconfusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered=T),factor(XGboost_predict,ordered = T))
xgboostconfusionmatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0  9 14
##          1 13 22
##                                           
##                Accuracy : 0.5345          
##                  95% CI : (0.3987, 0.6666)
##     No Information Rate : 0.6207          
##     P-Value [Acc > NIR] : 0.9303          
##                                           
##                   Kappa : 0.02            
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.4091          
##             Specificity : 0.6111          
##          Pos Pred Value : 0.3913          
##          Neg Pred Value : 0.6286          
##              Prevalence : 0.3793          
##          Detection Rate : 0.1552          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.5101          
##                                           
##        'Positive' Class : 0               
## 
xgboost.roc<-roc(factor(PCa_test_diabetic_largedata$readmitted,ordered=T),factor(XGboost_predict,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

xgboost.roc
## 
## Call:
## roc.default(response = factor(PCa_test_diabetic_largedata$readmitted,     ordered = T), predictor = factor(XGboost_predict, ordered = T),     plot = T, print.auc = TRUE)
## 
## Data: factor(XGboost_predict, ordered = T) in 23 controls (factor(PCa_test_diabetic_largedata$readmitted, ordered = T) 0) < 35 cases (factor(PCa_test_diabetic_largedata$readmitted, ordered = T) 1).
## Area under the curve: 0.5099
model_list <-append(model_list,"XGboostclassifier")
accuracy_list<-append(accuracy_list,xgboostconfusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,xgboostconfusionmatrix$overall['Kappa'])
PCa_test_diabetic_largedata$readmitted<-as.numeric(PCa_test_diabetic_largedata$readmitted)
PCa_train_diabetic_largedata$readmitted<-as.numeric(PCa_train_diabetic_largedata$readmitted)
PCa_train_diabetic_largedata$readmitted<-ifelse(PCa_train_diabetic_largedata$readmitted==1,0,1)
PCa_test_diabetic_largedata$readmitted<-ifelse(PCa_test_diabetic_largedata$readmitted==1,0,1)
#XGboost by PCA data
XGboostclassifier2<- xgboost(data = as.matrix(PCa_train_diabetic_largedata[-1]), label = PCa_train_diabetic_largedata$readmitted, nrounds = 10)
## [1]  train-rmse:0.396027 
## [2]  train-rmse:0.320324 
## [3]  train-rmse:0.260482 
## [4]  train-rmse:0.219760 
## [5]  train-rmse:0.195537 
## [6]  train-rmse:0.174610 
## [7]  train-rmse:0.158761 
## [8]  train-rmse:0.139501 
## [9]  train-rmse:0.128810 
## [10] train-rmse:0.112929
XGboost_predict2<- predict(XGboostclassifier2, newdata = as.matrix(PCa_test_diabetic_largedata[-1]))
head(XGboost_predict2)
## [1] 0.7526193 0.8681863 0.4298209 0.7602748 0.1396990 0.3438241
XGboost_predict2<-ifelse(XGboost_predict2>=0.5,1,0)
table(PCa_test_diabetic_largedata$readmitted,XGboost_predict2)
##    XGboost_predict2
##      0  1
##   0 11 12
##   1 10 25
#confusionMatrix(factor(PCa_test_diabetic_largedata$readmitted,ordered=T),factor(XGboost_predict2,ordered = T))
imp_matrix_XGboos2<-xgb.importance(model=XGboostclassifier2)
xgb.plot.importance(imp_matrix_XGboos2)

###Cross Validation

table(test_diabetic_largedata$readmitted)
## 
##  0  1 
## 23 35
table(train_diabetic_largedata$readmitted)
## 
##   0   1 
##  93 138
train_diabetic_largedata$readmitted<-ifelse(train_diabetic_largedata$readmitted=="0","NO","YES")
test_diabetic_largedata$readmitted<-ifelse(test_diabetic_largedata$readmitted=="0","NO","YES")
# Define trainControl object
TRC <- trainControl(method = "cv",number = 5,classProbs = TRUE,summaryFunction = twoClassSummary)

##Glm Cross validation

# Train glm model with cross-validation
GLMmodelCV <- train(readmitted ~ .,data = train_diabetic_largedata,method = "glm",metric = "ROC",trControl = TRC)
print(GLMmodelCV)
## Generalized Linear Model 
## 
## 231 samples
##  45 predictor
##   2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 185, 185, 184, 185, 185 
## Resampling results:
## 
##   ROC        Sens       Spec    
##   0.6325327  0.5590643  0.673545
GLMmodelCV$results
##   parameter       ROC      Sens     Spec      ROCSD     SensSD     SpecSD
## 1      none 0.6325327 0.5590643 0.673545 0.02591945 0.09703022 0.06960111
GLMmodelCVpredict<-predict(GLMmodelCV,test_diabetic_largedata)
GLMmodelCVpredict
##  [1] NO  NO  NO  YES YES NO  YES YES YES NO  YES NO  YES NO  NO  NO  NO  NO  YES
## [20] YES YES NO  YES NO  YES YES YES NO  YES NO  YES YES YES NO  NO  NO  YES NO 
## [39] YES NO  YES YES NO  NO  YES NO  YES NO  NO  YES YES YES YES NO  YES YES YES
## [58] YES
## Levels: NO YES
GLMmodelCVconfusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered = T),factor(GLMmodelCVpredict,ordered=T))
GLMmodelCVconfusionmatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NO YES
##        NO  14   9
##        YES 12  23
##                                           
##                Accuracy : 0.6379          
##                  95% CI : (0.5012, 0.7601)
##     No Information Rate : 0.5517          
##     P-Value [Acc > NIR] : 0.1169          
##                                           
##                   Kappa : 0.26            
##                                           
##  Mcnemar's Test P-Value : 0.6625          
##                                           
##             Sensitivity : 0.5385          
##             Specificity : 0.7188          
##          Pos Pred Value : 0.6087          
##          Neg Pred Value : 0.6571          
##              Prevalence : 0.4483          
##          Detection Rate : 0.2414          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.6286          
##                                           
##        'Positive' Class : NO              
## 
GLMmodelCVroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(GLMmodelCVpredict,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = NO, case = YES
## Setting direction: controls < cases

GLMmodelCVroc
## 
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted,     ordered = T), predictor = factor(GLMmodelCVpredict, ordered = T),     plot = T, print.auc = TRUE)
## 
## Data: factor(GLMmodelCVpredict, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) NO) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) YES).
## Area under the curve: 0.6329
model_list <-append(model_list,"GLMmodelCV")
accuracy_list<-append(accuracy_list,GLMmodelCVconfusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,GLMmodelCVconfusionmatrix$overall['Kappa'])

##Random Forest Cross validation

RFmodelcv <- train(readmitted~., data = train_diabetic_largedata, method = "rf",metric = "ROC",trControl = TRC) 
RFmodelcv
## Random Forest 
## 
## 231 samples
##  45 predictor
##   2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 185, 185, 184, 185, 185 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    2    0.6399506  0.2941520  0.8558201
##   23    0.6609580  0.4438596  0.7915344
##   45    0.6559802  0.4228070  0.7550265
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 23.
RFmodelcvpredict<-predict(RFmodelcv,test_diabetic_largedata)
RFmodelcvpredict
##  [1] YES NO  NO  YES YES NO  YES YES YES NO  YES NO  YES YES NO  NO  NO  NO  YES
## [20] YES YES YES YES YES YES YES YES NO  YES YES YES YES YES YES NO  YES YES NO 
## [39] YES YES YES YES NO  NO  YES NO  YES NO  NO  YES YES NO  YES YES YES YES YES
## [58] YES
## Levels: NO YES
RFmodelcvconfusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered = T),factor(RFmodelcvpredict,ordered=T))
RFmodelcvconfusionmatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NO YES
##        NO  11  12
##        YES  7  28
##                                           
##                Accuracy : 0.6724          
##                  95% CI : (0.5366, 0.7899)
##     No Information Rate : 0.6897          
##     P-Value [Acc > NIR] : 0.6700          
##                                           
##                   Kappa : 0.289           
##                                           
##  Mcnemar's Test P-Value : 0.3588          
##                                           
##             Sensitivity : 0.6111          
##             Specificity : 0.7000          
##          Pos Pred Value : 0.4783          
##          Neg Pred Value : 0.8000          
##              Prevalence : 0.3103          
##          Detection Rate : 0.1897          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.6556          
##                                           
##        'Positive' Class : NO              
## 
RFMmodelCVroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(RFmodelcvpredict,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = NO, case = YES
## Setting direction: controls < cases

RFMmodelCVroc
## 
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted,     ordered = T), predictor = factor(RFmodelcvpredict, ordered = T),     plot = T, print.auc = TRUE)
## 
## Data: factor(RFmodelcvpredict, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) NO) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) YES).
## Area under the curve: 0.6391
model_list <-append(model_list,"RFmodelcv")
accuracy_list<-append(accuracy_list,RFmodelcvconfusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,RFmodelcvconfusionmatrix$overall['Kappa'])

##knn Cross validation

k_values <- seq(1, 20, by = 1)
KNNmodelCV <- train(
  readmitted ~ ., 
  data = train_diabetic_largedata, 
  method = "knn", 
  metric = "ROC", 
  trControl = TRC, 
  tuneGrid = expand.grid(k = k_values)
)
KNNmodelCV
## k-Nearest Neighbors 
## 
## 231 samples
##  45 predictor
##   2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 185, 185, 184, 185, 185 
## Resampling results across tuning parameters:
## 
##   k   ROC        Sens       Spec     
##    1  0.5235589  0.4836257  0.5634921
##    2  0.4878968  0.4304094  0.5211640
##    3  0.5252158  0.4730994  0.6216931
##    4  0.5505013  0.4502924  0.6216931
##    5  0.5530528  0.4619883  0.6076720
##    6  0.5648914  0.3976608  0.6148148
##    7  0.5776977  0.4520468  0.6148148
##    8  0.5761557  0.4526316  0.6582011
##    9  0.5786097  0.4614035  0.6513228
##   10  0.5725459  0.4087719  0.6150794
##   11  0.5530006  0.3654971  0.6150794
##   12  0.5516674  0.3748538  0.6074074
##   13  0.5700675  0.3760234  0.6296296
##   14  0.5762879  0.3748538  0.6447090
##   15  0.5734510  0.3760234  0.6444444
##   16  0.5828216  0.3766082  0.6801587
##   17  0.5804407  0.4087719  0.6804233
##   18  0.5970934  0.4087719  0.6656085
##   19  0.6058793  0.3877193  0.6878307
##   20  0.6138123  0.3994152  0.6952381
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 20.
KNNmodelcvpredict<-predict(KNNmodelCV,test_diabetic_largedata)
KNNpredictCVconfiusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered = T),factor(KNNmodelcvpredict,ordered=T))
KNNpredictCVconfiusionmatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NO YES
##        NO  17   6
##        YES  7  28
##                                           
##                Accuracy : 0.7759          
##                  95% CI : (0.6473, 0.8749)
##     No Information Rate : 0.5862          
##     P-Value [Acc > NIR] : 0.001934        
##                                           
##                   Kappa : 0.5351          
##                                           
##  Mcnemar's Test P-Value : 1.000000        
##                                           
##             Sensitivity : 0.7083          
##             Specificity : 0.8235          
##          Pos Pred Value : 0.7391          
##          Neg Pred Value : 0.8000          
##              Prevalence : 0.4138          
##          Detection Rate : 0.2931          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.7659          
##                                           
##        'Positive' Class : NO              
## 
KNNpredictCVroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(KNNmodelcvpredict,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = NO, case = YES
## Setting direction: controls < cases

KNNpredictCVroc
## 
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted,     ordered = T), predictor = factor(KNNmodelcvpredict, ordered = T),     plot = T, print.auc = TRUE)
## 
## Data: factor(KNNmodelcvpredict, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) NO) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) YES).
## Area under the curve: 0.7696
model_list <-append(model_list,"KNNmodelCVF")
accuracy_list<-append(accuracy_list,KNNpredictCVconfiusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,KNNpredictCVconfiusionmatrix$overall['Kappa'])

##SVM Cross Validation

SVMmodelCV <- train(readmitted ~ ., data = train_diabetic_largedata, method = "svmRadial", tuneLength = 5, preProc = c("center", "scale"), metric = "ROC", trControl = TRC)
SVMmodelCV
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 231 samples
##  45 predictor
##   2 classes: 'NO', 'YES' 
## 
## Pre-processing: centered (45), scaled (45) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 185, 185, 185, 185, 184 
## Resampling results across tuning parameters:
## 
##   C     ROC        Sens        Spec     
##   0.25  0.6189223  0.32222222  0.7677249
##   0.50  0.6185324  0.35380117  0.8034392
##   1.00  0.5076998  0.14853801  0.8838624
##   2.00  0.5877471  0.24795322  0.8259259
##   4.00  0.5568713  0.05321637  0.9412698
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01274113
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01274113 and C = 0.25.
SVMmodelcvpredict<-predict(SVMmodelCV,test_diabetic_largedata)
SVMpredictCVconfiusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered = T),factor(SVMmodelcvpredict,ordered=T))
SVMpredictCVconfiusionmatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NO YES
##        NO   9  14
##        YES  5  30
##                                           
##                Accuracy : 0.6724          
##                  95% CI : (0.5366, 0.7899)
##     No Information Rate : 0.7586          
##     P-Value [Acc > NIR] : 0.95038         
##                                           
##                   Kappa : 0.2663          
##                                           
##  Mcnemar's Test P-Value : 0.06646         
##                                           
##             Sensitivity : 0.6429          
##             Specificity : 0.6818          
##          Pos Pred Value : 0.3913          
##          Neg Pred Value : 0.8571          
##              Prevalence : 0.2414          
##          Detection Rate : 0.1552          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.6623          
##                                           
##        'Positive' Class : NO              
## 
SVMmodelCVroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(SVMmodelcvpredict,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = NO, case = YES
## Setting direction: controls < cases

SVMmodelCVroc
## 
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted,     ordered = T), predictor = factor(SVMmodelcvpredict, ordered = T),     plot = T, print.auc = TRUE)
## 
## Data: factor(SVMmodelcvpredict, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) NO) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) YES).
## Area under the curve: 0.6242
model_list <-append(model_list,"SVMmodelCV")
accuracy_list<-append(accuracy_list,SVMpredictCVconfiusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,SVMpredictCVconfiusionmatrix$overall['Kappa'])

#XGboost cross validation

XGmodelCV <- train(readmitted ~ .,data = train_diabetic_largedata,method = "xgbTree",metric = "ROC",trControl = TRC,tuneGrid=expand.grid(nrounds = 10, 
                              max_depth = 6, 
                              eta = 0.3, 
                              gamma = 0.5, 
                              subsample = 0.8, 
                              colsample_bytree = 0.8, 
                              min_child_weight = 1))
XGmodelCV$bestTune
##   nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 1      10         6 0.3   0.5              0.8                1       0.8
XGmodelcvpredict<-predict(XGmodelCV,test_diabetic_largedata)
XGpredictCVconfiusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered = T),factor(XGmodelcvpredict,ordered=T))
XGpredictCVconfiusionmatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NO YES
##        NO  12  11
##        YES 12  23
##                                           
##                Accuracy : 0.6034          
##                  95% CI : (0.4664, 0.7295)
##     No Information Rate : 0.5862          
##     P-Value [Acc > NIR] : 0.4501          
##                                           
##                   Kappa : 0.1776          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.5000          
##             Specificity : 0.6765          
##          Pos Pred Value : 0.5217          
##          Neg Pred Value : 0.6571          
##              Prevalence : 0.4138          
##          Detection Rate : 0.2069          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.5882          
##                                           
##        'Positive' Class : NO              
## 
XGmodelCVroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(XGmodelcvpredict,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = NO, case = YES
## Setting direction: controls < cases

XGmodelCVroc
## 
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted,     ordered = T), predictor = factor(XGmodelcvpredict, ordered = T),     plot = T, print.auc = TRUE)
## 
## Data: factor(XGmodelcvpredict, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) NO) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) YES).
## Area under the curve: 0.5894
model_list <-append(model_list,"XGmodelCV")
accuracy_list<-append(accuracy_list,XGpredictCVconfiusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,XGpredictCVconfiusionmatrix$overall['Kappa'])

#Elastic Net Cross Validation

ENmodelcv <- train(readmitted ~ .,data = train_diabetic_largedata,method = "glmnet", metric = "ROC", trControl = TRC)
ENmodelcv
## glmnet 
## 
## 231 samples
##  45 predictor
##   2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 186, 185, 184, 184, 185 
## Resampling results across tuning parameters:
## 
##   alpha  lambda        ROC        Sens       Spec     
##   0.10   0.0002334386  0.6079644  0.4830409  0.6812169
##   0.10   0.0023343864  0.6188813  0.4836257  0.6812169
##   0.10   0.0233438645  0.6325699  0.4497076  0.7103175
##   0.55   0.0002334386  0.6091773  0.4941520  0.6812169
##   0.55   0.0023343864  0.6141511  0.4941520  0.6666667
##   0.55   0.0233438645  0.6535389  0.4491228  0.7759259
##   1.00   0.0002334386  0.6103762  0.4941520  0.6740741
##   1.00   0.0023343864  0.6137110  0.4614035  0.6738095
##   1.00   0.0233438645  0.6561945  0.4385965  0.7976190
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda = 0.02334386.
ENmodelcvpredict<-predict(ENmodelcv,test_diabetic_largedata)
ENpredictCVconfiusionmatrix<-confusionMatrix(factor(test_diabetic_largedata$readmitted,ordered = T),factor(ENmodelcvpredict,ordered=T))
ENpredictCVconfiusionmatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NO YES
##        NO  14   9
##        YES  9  26
##                                           
##                Accuracy : 0.6897          
##                  95% CI : (0.5546, 0.8046)
##     No Information Rate : 0.6034          
##     P-Value [Acc > NIR] : 0.1125          
##                                           
##                   Kappa : 0.3516          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.6087          
##             Specificity : 0.7429          
##          Pos Pred Value : 0.6087          
##          Neg Pred Value : 0.7429          
##              Prevalence : 0.3966          
##          Detection Rate : 0.2414          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.6758          
##                                           
##        'Positive' Class : NO              
## 
ENmodelCVroc<-roc(factor(test_diabetic_largedata$readmitted,ordered=T),factor(ENmodelcvpredict,ordered = T),plot=T,print.auc=TRUE)
## Setting levels: control = NO, case = YES
## Setting direction: controls < cases

ENmodelCVroc
## 
## Call:
## roc.default(response = factor(test_diabetic_largedata$readmitted,     ordered = T), predictor = factor(ENmodelcvpredict, ordered = T),     plot = T, print.auc = TRUE)
## 
## Data: factor(ENmodelcvpredict, ordered = T) in 23 controls (factor(test_diabetic_largedata$readmitted, ordered = T) NO) < 35 cases (factor(test_diabetic_largedata$readmitted, ordered = T) YES).
## Area under the curve: 0.6758
model_list <-append(model_list,"ENmodelcv")
accuracy_list<-append(accuracy_list,ENpredictCVconfiusionmatrix$overall['Accuracy'])
kappa_list<-append(kappa_list,ENpredictCVconfiusionmatrix$overall['Kappa'])
table(PCa_test_diabetic_largedata$readmitted)
## 
##  0  1 
## 23 35
table(PCa_train_diabetic_largedata$readmitted)
## 
##   0   1 
##  93 138

#Cross validation on PCA data set

PCa_train_diabetic_largedata$readmitted<-as.factor(ifelse(PCa_train_diabetic_largedata$readmitted==0,"NO","YES"))
PCa_test_diabetic_largedata$readmitted<-as.factor(ifelse(PCa_test_diabetic_largedata$readmitted==0,"NO","YES"))
table(PCa_train_diabetic_largedata$readmitted)
## 
##  NO YES 
##  93 138
table(PCa_test_diabetic_largedata$readmitted)
## 
##  NO YES 
##  23  35
# Define models to compare
models <- c("glm", "rf", "knn","svmRadial")
# Train and evaluate models
results <- lapply(models, function(model) {
 train(readmitted ~ ., data = PCa_train_diabetic_largedata, method = model, trControl = TRC)
})

# Compare models using resamples()
resamples(results)
## 
## Call:
## resamples.default(x = results)
## 
## Models: Model1, Model2, Model3, Model4 
## Number of resamples: 5 
## Performance metrics: ROC, Sens, Spec 
## Time estimates for: everything, final model fit
results
## [[1]]
## Generalized Linear Model 
## 
## 231 samples
##  20 predictor
##   2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 184, 185, 185, 184, 186 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.6446053  0.5339181  0.7251323
## 
## 
## [[2]]
## Random Forest 
## 
## 231 samples
##  20 predictor
##   2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 184, 186, 184, 186, 184 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    2    0.5871786  0.2350877  0.8108466
##   11    0.5993974  0.3438596  0.7751323
##   20    0.5946270  0.3561404  0.7460317
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 11.
## 
## [[3]]
## k-Nearest Neighbors 
## 
## 231 samples
##  20 predictor
##   2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 185, 185, 185, 185, 184 
## Resampling results across tuning parameters:
## 
##   k  ROC        Sens       Spec     
##   5  0.5781746  0.4514620  0.6814815
##   7  0.5967697  0.3982456  0.7166667
##   9  0.6610833  0.4736842  0.7825397
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
## 
## [[4]]
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 231 samples
##  20 predictor
##   2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 185, 184, 185, 186, 184 
## Resampling results across tuning parameters:
## 
##   C     ROC        Sens       Spec     
##   0.25  0.6227041  0.2947368  0.8182540
##   0.50  0.6223143  0.2058480  0.8767196
##   1.00  0.5790093  0.1403509  0.9346561
## 
## Tuning parameter 'sigma' was held constant at a value of 0.03162296
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.03162296 and C = 0.25.

#Cross validaion in PCA: The ROC of the models on the PCA data is not that high/ the best performance on the PCA data is the KNN by k=9 with the ROC 66% so I didn’t conclud them in my final model comparision

#Conclusion(cmparision of the models)

modelscompare<-data.frame(model_list,accuracy_list,kappa_list)
modelscompare
##           model_list accuracy_list kappa_list
## 1            LRmodel     0.6206897 0.24137931
## 2           LRmodel2     0.6551724 0.33865450
## 3        PCALRmodel3     0.6206897 0.25205158
## 4         predictKNN     0.7758621 0.53514180
## 5      PCApredictKNN     0.7413793 0.44726811
## 6           DTmodel1     0.6206897 0.14362416
## 7           RFmodel1     0.7241379 0.41488020
## 8  XGboostclassifier     0.5344828 0.02002503
## 9         GLMmodelCV     0.6379310 0.26002430
## 10         RFmodelcv     0.6724138 0.28903226
## 11       KNNmodelCVF     0.7758621 0.53514180
## 12        SVMmodelCV     0.6724138 0.26631158
## 13         XGmodelCV     0.6034483 0.17755857
## 14         ENmodelcv     0.6896552 0.35155280
ggplot(modelscompare, aes(x = accuracy_list, y = model_list)) +
  geom_bar(stat = "identity", aes(fill = kappa_list)) +scale_fill_gradient(low = 'red',high='green')+
  xlab("Accuracy") +
  ylab("models") +
  ggtitle("Models comparision") +
  theme(plot.title = element_text(hjust = 0.5))

ACUlist<-c(LRmodelroc$auc,LRmodel2roc$auc,LRmodel3roc$auc,ELmodelroc$auc,predictKNN.roc$auc,PCApredictKNN.roc$auc,SVMmodel1.roc$auc,SVMmodel2.roc$auc,DTpredict1.roc$auc,DTpredict2.roc$auc,RFpredict1.roc$auc,RFpredict2.roc$auc,GLMmodelCVroc$auc,RFMmodelCVroc$auc,KNNpredictCVroc$auc,SVMmodelCVroc$auc,XGmodelCVroc$auc,ENmodelCVroc$auc)
modelnames <- c("LRmodel", "LRmodel2", "LRmodel3", "ELmodel", "predictKNN", "PCApredictKNN", "SVMmodel1", "SVMmodel2", "DTpredict1", "DTpredict2", "RFpredict1", "RFpredict2", "GLMmodelCV", "RFMmodelCV", "KNNpredictCV", "SVMmodelCV", "XGmodelCV", "ENmodelCV")
acucompare<-data.frame(modelnames, ACUlist)
acucompare
##       modelnames   ACUlist
## 1        LRmodel 0.7043478
## 2       LRmodel2 0.7180124
## 3       LRmodel3 0.6521739
## 4        ELmodel 0.6993789
## 5     predictKNN 0.7695652
## 6  PCApredictKNN 0.7186335
## 7      SVMmodel1 0.5000000
## 8      SVMmodel2 0.5000000
## 9     DTpredict1 0.5664596
## 10    DTpredict2 0.5242236
## 11    RFpredict1 0.7043478
## 12    RFpredict2 0.7329193
## 13    GLMmodelCV 0.6329193
## 14    RFMmodelCV 0.6391304
## 15  KNNpredictCV 0.7695652
## 16    SVMmodelCV 0.6242236
## 17     XGmodelCV 0.5894410
## 18     ENmodelCV 0.6757764
ggplot(acucompare, aes(x = ACUlist, y = modelnames, fill = ACUlist)) +
  geom_bar(stat = "identity") +
  scale_fill_gradient2(low = "red", mid = "yellow", high = "green", midpoint = 0.65)

#Initially, I performed some data cleaning and preprocessing steps on the dataset. after that I end up containing 45 predictors and 289 observations. I dropped variables that contained missing values, such as weight or the ones that didn't contain any special information like some ids.Moreover, removed rows with missing values.

#Next, I bucketized the diagnostic columns based on a provided table and transformed all categorical variables into dummy variables. I then split the data into test and train sets , and subsequently scaled the entire dataset to prevent overshadowing effects. While I tried using PCA for dimensionality reduction, the presence of dummy variables caused that  20 principal components could cover 80% of the data, leading to only limited dimensionality reduction. Nonetheless, I kept the PCA data for evaluating the model performance.
 
#To evaluate the performance of the supervised learning models, I applied them to both the PCA and non-PCA datasets, and employed 5-fold cross-validation. As the outcome variable was not imbalanced, I considered accuracy evaluation metrics for comparing the models.As well as AUC and Kappa. For checking which same models fit better to data like evaluating several logestic rigressions I compare their Akaike as well.
#After comparing the performance of various models, I found that the KNN model performed best on the non-PCA dataset with an AUC of 0.7696 and a precision of 73%. Although my best model(KNN) is based on Euclidean distance and it is hard to say which predictor was more important or play more important role in predicting the readmition of the patient, Based on other models like random forest2(accuracy 72% and AUC 69%) and logestic2 regression (accuracy 65%, AUC 71%) I found "Number of lab procedures " and "time in hospitals" important variables